From 67961f9db8c477026ea20ce05761bde6f8bf85b0 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 8 Jun 2016 15:33:42 -0700 Subject: mm/hugetlb: fix huge page reserve accounting for private mappings When creating a private mapping of a hugetlbfs file, it is possible to unmap pages via ftruncate or fallocate hole punch. If subsequent faults repopulate these mappings, the reserve counts will go negative. This is because the code currently assumes all faults to private mappings will consume reserves. The problem can be recreated as follows: - mmap(MAP_PRIVATE) a file in a hugetlbfs filesystem - write fault in pages in the mapping - fallocate(FALLOC_FL_PUNCH_HOLE) some pages in the mapping - write fault in pages in the hole This will result in negative huge page reserve counts and negative subpool usage counts for the hugetlbfs. Note that this can also be recreated with ftruncate, but fallocate is more straightforward. This patch modifies the routines vma_needs_reserves and vma_has_reserves to examine the reserve map associated with private mappings similar to that for shared mappings. However, the reserve map semantics for private and shared mappings are very different. This results in subtly different code that is explained in the comments. Link: http://lkml.kernel.org/r/1464720957-15698-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Hillf Danton Cc: Dave Hansen Cc: Kirill Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Aneesh Kumar Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d26162e81fea..388c2bb9b55c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -832,8 +832,27 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) * Only the process that called mmap() has reserves for * private mappings. 
*/ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return true; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + /* + * Like the shared case above, a hole punch or truncate + * could have been performed on the private mapping. + * Examine the value of chg to determine if reserves + * actually exist or were previously consumed. + * Very Subtle - The value of chg comes from a previous + * call to vma_needs_reserves(). The reserve map for + * private mappings has different (opposite) semantics + * than that of shared mappings. vma_needs_reserves() + * has already taken this difference in semantics into + * account. Therefore, the meaning of chg is the same + * as in the shared case above. Code could easily be + * combined, but keeping it separate draws attention to + * subtle differences. + */ + if (chg) + return false; + else + return true; + } return false; } @@ -1816,6 +1835,25 @@ static long __vma_reservation_common(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) return ret; + else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { + /* + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret) + return 0; + else + return 1; + } else return ret < 0 ? 
ret : 0; } -- cgit v1.2.3 From 91a4c272145652d798035c17e1c02c91001d3f51 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 8 Jun 2016 15:33:45 -0700 Subject: kasan: change memory hot-add error messages to info messages Change the following memory hot-add error messages to info messages. There is no need for these to be errors. kasan: WARNING: KASAN doesn't support memory hot-add kasan: Memory hot-add will be disabled Link: http://lkml.kernel.org/r/1464794430-5486-1-git-send-email-shuahkh@osg.samsung.com Signed-off-by: Shuah Khan Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 18b6a2b8d183..28439acda6ec 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -763,8 +763,8 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASAN doesn't support memory hot-add\n"); - pr_err("Memory hot-add will be disabled\n"); + pr_info("WARNING: KASAN doesn't support memory hot-add\n"); + pr_info("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); -- cgit v1.2.3 From d0db7afa1b767d95e3e14632718da5a9794129bc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 8 Jun 2016 15:33:47 -0700 Subject: revert "mm: memcontrol: fix possible css ref leak on oom" Revert commit 1383399d7be0 ("mm: memcontrol: fix possible css ref leak on oom"). Johannes points out "There is a task_in_memcg_oom() check before calling mem_cgroup_oom()". 
Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 58c69c94402a..75e74408cc8f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1608,7 +1608,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom || current->memcg_in_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we -- cgit v1.2.3 From 770a5370226cb207461bbad902543381c1fad521 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Wed, 8 Jun 2016 15:33:50 -0700 Subject: mm: thp: broken page count after commit aa88b68c3b1d Christian Borntraeger reported a kernel panic after corrupt page counts, and it turned out to be a regression introduced with commit aa88b68c3b1d ("thp: keep huge zero page pinned until tlb flush"), at least on s390. put_huge_zero_page() was moved over from zap_huge_pmd() to release_pages(), and it was replaced by tlb_remove_page(). However, release_pages() might not always be triggered by (the arch-specific) tlb_remove_page(). On s390 we call free_page_and_swap_cache() from tlb_remove_page(), and not tlb_flush_mmu() -> free_pages_and_swap_cache() like the generic version, because we don't use the MMU-gather logic. Although both functions have very similar names, they do very dissimilar things: in particular, free_page_xxx just does a put_page(), while free_pages_xxx calls release_pages(). This of course results in very harmful put_page()s on the huge zero page, on architectures where tlb_remove_page() is implemented in this way. It seems to affect only s390 and sh, but sh doesn't have THP support, so the problem (currently) probably only exists on s390. 
The following quick hack fixed the issue: Link: http://lkml.kernel.org/r/20160602172141.75c006a9@thinkpad Signed-off-by: Gerald Schaefer Reported-by: Christian Borntraeger Tested-by: Christian Borntraeger Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Mel Gorman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Dave Hansen Cc: Vlastimil Babka Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: [4.6.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 0d457e7db8d6..c99463ac02fb 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -252,7 +252,10 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - put_page(page); + if (is_huge_zero_page(page)) + put_huge_zero_page(); + else + put_page(page); } /* -- cgit v1.2.3 From ba62bafe942b159a6109cbec780d36496e06b6c5 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 8 Jun 2016 15:33:53 -0700 Subject: kernel/relay.c: fix potential memory leak When relay_open_buf() fails in relay_open(), the code jumps to free_bufs, but chan is never freed. 
Link: http://lkml.kernel.org/r/1464777927-19675-1-git-send-email-yizhouzhou@ict.ac.cn Signed-off-by: Zhouyi Zhou Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/relay.c b/kernel/relay.c index 074994bcfa9b..04d7cf3ef8cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -614,6 +614,7 @@ free_bufs: kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); + kfree(chan); return NULL; } EXPORT_SYMBOL_GPL(relay_open); -- cgit v1.2.3 From f3a932baa7f65072434f1c04c02c8a4d2746fcfc Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 8 Jun 2016 15:33:56 -0700 Subject: mm: introduce dedicated WQ_MEM_RECLAIM workqueue to do lru_add_drain_all This patch is based on https://patchwork.ozlabs.org/patch/574623/. Tejun submitted commit 23d11a58a9a6 ("workqueue: skip flush dependency checks for legacy workqueues") for the legacy create*_workqueue() interface. But some workqueues created by alloc_workqueue() still report warnings on memory reclaim, e.g. nvme_workq with flag WQ_MEM_RECLAIM set: workqueue: WQ_MEM_RECLAIM nvme:nvme_reset_work is flushing !WQ_MEM_RECLAIM events:lru_add_drain_per_cpu ------------[ cut here ]------------ WARNING: CPU: 0 PID: 6 at SoC/linux/kernel/workqueue.c:2448 check_flush_dependency+0xb4/0x10c ... check_flush_dependency+0xb4/0x10c flush_work+0x54/0x140 lru_add_drain_all+0x138/0x188 migrate_prep+0xc/0x18 alloc_contig_range+0xf4/0x350 cma_alloc+0xec/0x1e4 dma_alloc_from_contiguous+0x38/0x40 __dma_alloc+0x74/0x25c nvme_alloc_queue+0xcc/0x36c nvme_reset_work+0x5c4/0xda8 process_one_work+0x128/0x2ec worker_thread+0x58/0x434 kthread+0xd4/0xe8 ret_from_fork+0x10/0x50 That's because lru_add_drain_all() will schedule the drain work on system_wq, whose flag is set to 0, !WQ_MEM_RECLAIM. Introduce a dedicated WQ_MEM_RECLAIM workqueue to do lru_add_drain_all(), aiding in getting memory freed. 
Link: http://lkml.kernel.org/r/1464917521-9775-1-git-send-email-shhuiw@foxmail.com Signed-off-by: Wang Sheng-Hui Acked-by: Tejun Heo Cc: Keith Busch Cc: Peter Zijlstra Cc: Thierry Reding Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 95916142fc46..59f5fafa6e1f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -667,6 +667,24 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); +/* + * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM + * workqueue, aiding in getting memory freed. + */ +static struct workqueue_struct *lru_add_drain_wq; + +static int __init lru_init(void) +{ + lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0); + + if (WARN(!lru_add_drain_wq, + "Failed to create workqueue lru_add_drain_wq")) + return -ENOMEM; + + return 0; +} +early_initcall(lru_init); + void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); @@ -686,7 +704,7 @@ void lru_add_drain_all(void) pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); - schedule_work_on(cpu, work); + queue_work_on(cpu, lru_add_drain_wq, work); cpumask_set_cpu(cpu, &has_work); } } -- cgit v1.2.3 From 18aba41cbfbcd138e9f6d8d446427d8b7691c194 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Wed, 8 Jun 2016 15:33:59 -0700 Subject: mm/fadvise.c: do not discard partial pages with POSIX_FADV_DONTNEED I noticed that the logic in the fadvise64_64 syscall is incorrect for partial pages. While the first page of the region is correctly skipped if it is partial, the last page of the region is mistakenly discarded. This leads to problems for applications that read data in non-page-aligned chunks, discarding already processed data between the reads. 
A somewhat misguided application that does something like write(XX bytes (non-page-aligned)); drop the data it just wrote; repeat gets a significant penalty in performance as a result. Link: http://lkml.kernel.org/r/1464917140-1506698-1-git-send-email-green@linuxhacker.ru Signed-off-by: Oleg Drokin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/fadvise.c b/mm/fadvise.c index b8024fa7101d..6c707bfe02fd 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -126,6 +126,17 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); + if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) { + /* First page is tricky as 0 - 1 = -1, but pgoff_t + * is unsigned, so the end_index >= start_index + * check below would be true and we'll discard the whole + * file cache which is not what was asked. + */ + if (end_index == 0) + break; + + end_index--; + } if (end_index >= start_index) { unsigned long count = invalidate_mapping_pages(mapping, -- cgit v1.2.3