From b059a453b1cf1c8453c2b2ed373d3147d6264ebd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 28 Jun 2016 14:35:38 +0300 Subject: x86/vdso: Add mremap hook to vm_special_mapping Add possibility for 32-bit user-space applications to move the vDSO mapping. Previously, when a user-space app called mremap() for the vDSO address, in the syscall return path it would land on the previous address of the vDSOpage, resulting in segmentation violation. Now it lands fine and returns to userspace with a remapped vDSO. This will also fix the context.vdso pointer for 64-bit, which does not affect the user of vDSO after mremap() currently, but this may change in the future. As suggested by Andy, return -EINVAL for mremap() that would split the vDSO image: that operation cannot possibly result in a working system so reject it. Renamed and moved the text_mapping structure declaration inside map_vdso(), as it used only there and now it complements the vvar_mapping variable. There is still a problem for remapping the vDSO in glibc applications: the linker relocates addresses for syscalls on the vDSO page, so you need to relink with the new addresses. Without that the next syscall through glibc may fail: Program received signal SIGSEGV, Segmentation fault. #0 0xf7fd9b80 in __kernel_vsyscall () #1 0xf7ec8238 in _exit () from /usr/lib32/libc.so.6 Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160628113539.13606-2-dsafonov@virtuozzo.com Signed-off-by: Ingo Molnar --- mm/mmap.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index de2c1769cc68..234edffec1d0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2943,9 +2943,19 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } +static int special_mapping_mremap(struct vm_area_struct *new_vma) +{ + struct vm_special_mapping *sm = new_vma->vm_private_data; + + if (sm->mremap) + return sm->mremap(sm, new_vma); + return 0; +} + static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, + .mremap = special_mapping_mremap, .name = special_mapping_name, }; -- cgit v1.2.3 From 7f556567036cb7f89aabe2f0954b08566b4efb53 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 10 Jul 2016 16:46:32 -0700 Subject: tmpfs: fix regression hang in fallocate undo The well-spotted fallocate undo fix is good in most cases, but not when fallocate failed on the very first page. index 0 then passes lend -1 to shmem_undo_range(), and that has two bad effects: (a) that it will undo every fallocation throughout the file, unrestricted by the current range; but more importantly (b) it can cause the undo to hang, because lend -1 is treated as truncation, which makes it keep on retrying until every page has gone, but those already fully instantiated will never go away. Big thank you to xfstests generic/269 which demonstrates this. Fixes: b9b4bb26af01 ("tmpfs: don't undo fallocate past its last page") Cc: stable@vger.kernel.org Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 24463b67b6ef..171dee7a131f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2225,9 +2225,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ - shmem_undo_range(inode, - (loff_t)start << PAGE_SHIFT, - ((loff_t)index << PAGE_SHIFT) - 1, true); + if (index > start) { + shmem_undo_range(inode, + (loff_t)start << PAGE_SHIFT, + ((loff_t)index << PAGE_SHIFT) - 1, true); + } goto undone; } -- cgit v1.2.3 From a46cbf3bc53b6a93fb84a5ffb288c354fa807954 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 14 Jul 2016 12:06:50 -0700 Subject: mm, compaction: prevent VM_BUG_ON when terminating freeing scanner It's possible to isolate some freepages in a pageblock and then fail split_free_page() due to the low watermark check. In this case, we hit VM_BUG_ON() because the freeing scanner terminated early without a contended lock or enough freepages. This should never have been a VM_BUG_ON() since it's not a fatal condition. It should have been a VM_WARN_ON() at best, or even handled gracefully. Regardless, we need to terminate anytime the full pageblock scan was not done. The logic belongs in isolate_freepages_block(), so handle its state gracefully by terminating the pageblock loop and making a note to restart at the same pageblock next time since it was not possible to complete the scan this time. [rientjes@google.com: don't rescan pages in a pageblock] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1607111244150.83138@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1606291436300.145590@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Minchan Kim Tested-by: Minchan Kim Cc: Joonsoo Kim Cc: Hugh Dickins Cc: Mel Gorman Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 79bfe0e06907..7bc04778f84d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1009,8 +1009,6 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { - unsigned long isolated; - /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need @@ -1034,36 +1032,30 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); - /* If isolation failed early, do not continue needlessly */ - if (!isolated && isolate_start_pfn < block_end_pfn && - cc->nr_migratepages > cc->nr_freepages) - break; + isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, + freelist, false); /* - * If we isolated enough freepages, or aborted due to async - * compaction being contended, terminate the loop. - * Remember where the free scanner should restart next time, - * which is where isolate_freepages_block() left off. - * But if it scanned the whole pageblock, isolate_start_pfn - * now points at block_end_pfn, which is the start of the next - * pageblock. - * In that case we will however want to restart at the start - * of the previous pageblock. + * If we isolated enough freepages, or aborted due to lock + * contention, terminate. */ if ((cc->nr_freepages >= cc->nr_migratepages) || cc->contended) { - if (isolate_start_pfn >= block_end_pfn) + if (isolate_start_pfn >= block_end_pfn) { + /* + * Restart at previous pageblock if more + * freepages can be isolated next time. + */ isolate_start_pfn = block_start_pfn - pageblock_nr_pages; + } break; - } else { + } else if (isolate_start_pfn < block_end_pfn) { /* - * isolate_freepages_block() should not terminate - * prematurely unless contended, or isolated enough + * If isolation failed early, do not continue + * needlessly. */ - VM_BUG_ON(isolate_start_pfn < block_end_pfn); + break; } } -- cgit v1.2.3 From 9818b8cde6221fe7a52aad816425f67d4439b14e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 14 Jul 2016 12:07:12 -0700 Subject: madvise_free, thp: fix madvise_free_huge_pmd return value after splitting madvise_free_huge_pmd should return 0 if the fallback PTE operations are required. In madvise_free_huge_pmd, if part pages of THP are discarded, the THP will be split and fallback PTE operations should be used if splitting succeeds. But the original code will make fallback PTE operations skipped, after splitting succeeds. Fix that via make madvise_free_huge_pmd return 0 after splitting successfully, so that the fallback PTE operations will be done. Link: http://lkml.kernel.org/r/1467135452-16688-1-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Minchan Kim Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Andrea Arcangeli Cc: Ebru Akagunduz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ed58530f695..55940d173318 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1624,14 +1624,9 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (next - addr != HPAGE_PMD_SIZE) { get_page(page); spin_unlock(ptl); - if (split_huge_page(page)) { - put_page(page); - unlock_page(page); - goto out_unlocked; - } + split_huge_page(page); put_page(page); unlock_page(page); - ret = 1; goto out_unlocked; } -- cgit v1.2.3 From 0ab686d8c8303069e80300663b3be6201a8697fb Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 14 Jul 2016 12:07:17 -0700 Subject: kasan/quarantine: fix bugs on qlist_move_cache() There are two bugs on qlist_move_cache(). One is that qlist's tail isn't set properly. curr->next can be NULL since it is singly linked list and NULL value on tail is invalid if there is one item on qlist. Another one is that if cache is matched, qlist_put() is called and it will set curr->next to NULL. It would cause to stop the loop prematurely. These problems come from complicated implementation so I'd like to re-implement it completely. Implementation in this patch is really simple. Iterate all qlist_nodes and put them to appropriate list. Unfortunately, I got this bug sometime ago and lose oops message. But, the bug looks trivial and no need to attach oops. Fixes: 55834c59098d ("mm: kasan: initial memory quarantine implementation") Link: http://lkml.kernel.org/r/1467766348-22419-1-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Dmitry Vyukov Acked-by: Andrey Ryabinin Acked-by: Alexander Potapenko Cc: Kuthonuzo Luruo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/quarantine.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 4973505a9bdd..65793f150d1f 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -238,30 +238,23 @@ static void qlist_move_cache(struct qlist_head *from, struct qlist_head *to, struct kmem_cache *cache) { - struct qlist_node *prev = NULL, *curr; + struct qlist_node *curr; if (unlikely(qlist_empty(from))) return; curr = from->head; + qlist_init(from); while (curr) { - struct qlist_node *qlink = curr; - struct kmem_cache *obj_cache = qlink_to_cache(qlink); - - if (obj_cache == cache) { - if (unlikely(from->head == qlink)) { - from->head = curr->next; - prev = curr; - } else - prev->next = curr->next; - if (unlikely(from->tail == qlink)) - from->tail = curr->next; - from->bytes -= cache->size; - qlist_put(to, qlink, cache->size); - } else { - prev = curr; - } - curr = curr->next; + struct qlist_node *next = curr->next; + struct kmem_cache *obj_cache = qlink_to_cache(curr); + + if (obj_cache == cache) + qlist_put(to, curr, obj_cache->size); + else + qlist_put(from, curr, obj_cache->size); + + curr = next; } } -- cgit v1.2.3 From e4568d3803852d00effd41dcdd489e726b998879 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Jul 2016 12:07:20 -0700 Subject: mm, meminit: always return a valid node from early_pfn_to_nid early_pfn_to_nid can return node 0 if a PFN is invalid on machines that has no node 0. A machine with only node 1 was observed to crash with the following message: BUG: unable to handle kernel paging request at 000000000002a3c8 PGD 0 Modules linked in: Hardware name: Supermicro H8DSP-8/H8DSP-8, BIOS 080011 06/30/2006 task: ffffffff81c0d500 ti: ffffffff81c00000 task.ti: ffffffff81c00000 RIP: reserve_bootmem_region+0x6a/0xef CR2: 000000000002a3c8 CR3: 0000000001c06000 CR4: 00000000000006b0 Call Trace: free_all_bootmem+0x4b/0x12a mem_init+0x70/0xa3 start_kernel+0x25b/0x49b The problem is that early_page_uninitialised uses the early_pfn_to_nid helper which returns node 0 for invalid PFNs. No caller of early_pfn_to_nid cares except early_page_uninitialised. This patch has early_pfn_to_nid always return a valid node. Link: http://lkml.kernel.org/r/1468008031-3848-3-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6903b695ebae..5d013526bd0a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1273,7 +1273,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid < 0) - nid = 0; + nid = first_online_node; spin_unlock(&early_pfn_lock); return nid; -- cgit v1.2.3 From ef70b6f41cda6270165a6f27b2548ed31cfa3cb2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Jul 2016 12:07:23 -0700 Subject: mm, meminit: ensure node is online before checking whether pages are uninitialised early_page_uninitialised looks up an arbitrary PFN. While a machine without node 0 will boot with "mm, page_alloc: Always return a valid node from early_pfn_to_nid", it works because it assumes that nodes are always in PFN order. This is not guaranteed so this patch adds robustness by always checking if the node being checked is online. Link: http://lkml.kernel.org/r/1468008031-3848-4-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d013526bd0a..8b3e1341b754 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,7 +286,9 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) + int nid = early_pfn_to_nid(pfn); + + if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return true; return false; -- cgit v1.2.3 From 33f4751e99601b7bfd1d66aedabd3ee9140922de Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2016 12:07:32 -0700 Subject: mm: thp: move pmd check inside ptl for freeze_page() I found a race condition triggering VM_BUG_ON() in freeze_page(), when running a testcase with 3 processes: - process 1: keep writing thp, - process 2: keep clearing soft-dirty bits from virtual address of process 1 - process 3: call migratepages for process 1, The kernel message is like this: kernel BUG at /src/linux-dev/mm/huge_memory.c:3096! invalid opcode: 0000 [#1] SMP Modules linked in: cfg80211 rfkill crc32c_intel ppdev serio_raw pcspkr virtio_balloon virtio_console parport_pc parport pvpanic acpi_cpufreq tpm_tis tpm i2c_piix4 virtio_blk virtio_net ata_generic pata_acpi floppy virtio_pci virtio_ring virtio CPU: 0 PID: 28863 Comm: migratepages Not tainted 4.6.0-v4.6-160602-0827-+ #2 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880037320000 ti: ffff88007cdd0000 task.ti: ffff88007cdd0000 RIP: 0010:[] [] split_huge_page_to_list+0x496/0x590 RSP: 0018:ffff88007cdd3b70 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff88007c7b88c0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000700000200 RDI: ffffea0003188000 RBP: ffff88007cdd3bb8 R08: 0000000000000001 R09: 00003ffffffff000 R10: ffff880000000000 R11: ffffc000001fffff R12: ffffea0003188000 R13: ffffea0003188000 R14: 0000000000000000 R15: 0400000000000080 FS: 00007f8ec241d740(0000) GS:ffff88007dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8ec1f3ed20 CR3: 000000003707b000 CR4: 00000000000006f0 Call Trace: ? list_del+0xd/0x30 queue_pages_pte_range+0x4d1/0x590 __walk_page_range+0x204/0x4e0 walk_page_range+0x71/0xf0 queue_pages_range+0x75/0x90 ? queue_pages_hugetlb+0x190/0x190 ? new_node_page+0xc0/0xc0 ? change_prot_numa+0x40/0x40 migrate_to_node+0x71/0xd0 do_migrate_pages+0x1c3/0x210 SyS_migrate_pages+0x261/0x290 entry_SYSCALL_64_fastpath+0x1a/0xa4 Code: e8 b0 87 fb ff 0f 0b 48 c7 c6 30 32 9f 81 e8 a2 87 fb ff 0f 0b 48 c7 c6 b8 46 9f 81 e8 94 87 fb ff 0f 0b 85 c0 0f 84 3e fd ff ff <0f> 0b 85 c0 0f 85 a6 00 00 00 48 8b 75 c0 4c 89 f7 41 be f0 ff RIP split_huge_page_to_list+0x496/0x590 I'm not sure of the full scenario of the reproduction, but my debug showed that split_huge_pmd_address(freeze=true) returned without running main code of pmd splitting because pmd_present(*pmd) in precheck somehow returned 0. If this happens, the subsequent try_to_unmap() fails and returns non-zero (because page_mapcount() still > 0), and finally VM_BUG_ON() fires. This patch tries to fix it by prechecking pmd state inside ptl. Link: http://lkml.kernel.org/r/1466990929-7452-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 31 ++++++++++++------------------- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 419fb9e03447..f0a7a0320300 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -94,7 +94,7 @@ static inline int split_huge_page(struct page *page) void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze); + unsigned long address, bool freeze, struct page *page); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ @@ -102,7 +102,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ - false); \ + false, NULL); \ } while (0) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 55940d173318..343a2b7e57aa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2984,7 +2984,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze) + unsigned long address, bool freeze, struct page *page) { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; @@ -2992,8 +2992,17 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); ptl = pmd_lock(mm, pmd); + + /* + * If caller asks to setup a migration entries, we need a page to check + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); + if (page && page != pmd_page(*pmd)) + goto out; + if (pmd_trans_huge(*pmd)) { - struct page *page = pmd_page(*pmd); + page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); } else if (!pmd_devmap(*pmd)) @@ -3020,24 +3029,8 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, return; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) - return; - /* - * If caller asks to setup a migration entries, we need a page to check - * pmd against. Otherwise we can end up replacing wrong page. - */ - VM_BUG_ON(freeze && !page); - if (page && page != pmd_page(*pmd)) - return; - - /* - * Caller holds the mmap_sem write mode or the anon_vma lock, - * so a huge pmd cannot materialize from under us (khugepaged - * holds both the mmap_sem write mode and the anon_vma lock - * write mode). - */ - __split_huge_pmd(vma, pmd, address, freeze); + __split_huge_pmd(vma, pmd, address, freeze, page); } void vma_adjust_trans_huge(struct vm_area_struct *vma, -- cgit v1.2.3 From 55bda43bb26d2c11eeedac742eff87a8ac34c106 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2016 12:07:35 -0700 Subject: mm: rmap: call page_check_address() with sync enabled to avoid racy check The previous patch addresses the race between split_huge_pmd_address() and someone changing the pmd. The fix is only for splitting of normal thp (i.e. pmd-mapped thp,) and for splitting of pte-mapped thp there still is the similar race. For splitting pte-mapped thp, the pte's conversion is done by try_to_unmap_one(TTU_MIGRATION). This function checks page_check_address() to get the target pte, but it can return NULL under some race, leading to VM_BUG_ON() in freeze_page(). Fortunately, page_check_address() already has an argument to decide whether we do a quick/racy check or not, so let's flip it when called from freeze_page(). Link: http://lkml.kernel.org/r/1466990929-7452-2-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 0ea5d9071b32..e4b713a6ed7e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1427,7 +1427,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, goto out; } - pte = page_check_address(page, mm, address, &ptl, 0); + pte = page_check_address(page, mm, address, &ptl, + PageTransCompound(page)); if (!pte) goto out; -- cgit v1.2.3 From 5a49973d7143ebbabd76e1dcd69ee42e349bb7b9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 14 Jul 2016 12:07:38 -0700 Subject: mm: thp: refix false positive BUG in page_move_anon_rmap() The VM_BUG_ON_PAGE in page_move_anon_rmap() is more trouble than it's worth: the syzkaller fuzzer hit it again. It's still wrong for some THP cases, because linear_page_index() was never intended to apply to addresses before the start of a vma. That's easily fixed with a signed long cast inside linear_page_index(); and Dmitry has tested such a patch, to verify the false positive. But why extend linear_page_index() just for this case? when the avoidance in page_move_anon_rmap() has already grown ugly, and there's no reason for the check at all (nothing else there is using address or index). Remove address arg from page_move_anon_rmap(), remove VM_BUG_ON_PAGE, remove CONFIG_DEBUG_VM PageTransHuge adjustment. And one more thing: should the compound_head(page) be done inside or outside page_move_anon_rmap()? It's usually pushed down to the lowest level nowadays (and mm/memory.c shows no other explicit use of it), so I think it's better done in page_move_anon_rmap() than by caller. Fixes: 0798d3c022dc ("mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()") Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1607120444540.12528@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Dmitry Vyukov Acked-by: Kirill A. Shutemov Cc: Mika Westerberg Cc: Andrea Arcangeli Cc: Rik van Riel Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 3 +-- mm/rmap.c | 9 +++------ 4 files changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 49eb4f8ebac9..2b0fad83683f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -158,7 +158,7 @@ struct anon_vma *page_get_anon_vma(struct page *page); /* * rmap interfaces called when adding or removing pte of page */ -void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c1f3c0be150a..addfe4accc07 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3383,7 +3383,7 @@ retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { - page_move_anon_rmap(old_page, vma, address); + page_move_anon_rmap(old_page, vma); set_huge_ptep_writable(vma, address, ptep); return 0; } diff --git a/mm/memory.c b/mm/memory.c index cd1f29e4897e..9e046819e619 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2399,8 +2399,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(compound_head(old_page), - vma, address); + page_move_anon_rmap(old_page, vma); } unlock_page(old_page); return wp_page_reuse(mm, vma, address, page_table, ptl, diff --git a/mm/rmap.c b/mm/rmap.c index e4b713a6ed7e..701b93fea2a0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1084,23 +1084,20 @@ EXPORT_SYMBOL_GPL(page_mkclean); * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma * @vma: the vma the page belongs to - * @address: the user virtual address mapped * * When a page belongs exclusively to one process after a COW event, * that page can be moved into the anon_vma that belongs to just that * process, so the rmap code will not search the parent or sibling * processes. */ -void page_move_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) +void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; + page = compound_head(page); + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_VMA(!anon_vma, vma); - if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page)) - address &= HPAGE_PMD_MASK; - VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; /* -- cgit v1.2.3 From d3d36c4b5c5fbdd68542be73ffd53eb210fbf7cf Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 14 Jul 2016 12:07:41 -0700 Subject: mm: workingset: printk missing log level, use pr_info() Commit 612e44939c3c ("mm: workingset: eviction buckets for bigmem/lowbit machines") added a printk without a log level. Quieten it by using pr_info(). Link: http://lkml.kernel.org/r/1466982072-29836-2-git-send-email-anton@ozlabs.org Signed-off-by: Anton Blanchard Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index 8a75f8d2916a..577277546d98 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -491,7 +491,7 @@ static int __init workingset_init(void) max_order = fls_long(totalram_pages - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; - printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); -- cgit v1.2.3 From 73f576c04b9410ed19660f74f97521bee6e1c546 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jul 2016 15:44:57 -0700 Subject: mm: memcontrol: fix cgroup creation failure after many small jobs The memory controller has quite a bit of state that usually outlives the cgroup and pins its CSS until said state disappears. At the same time it imposes a 16-bit limit on the CSS ID space to economically store IDs in the wild. Consequently, when we use cgroups to contain frequent but small and short-lived jobs that leave behind some page cache, we quickly run into the 64k limitations of outstanding CSSs. Creating a new cgroup fails with -ENOSPC while there are only a few, or even no user-visible cgroups in existence. Although pinning CSSs past cgroup removal is common, there are only two instances that actually need an ID after a cgroup is deleted: cache shadow entries and swapout records. Cache shadow entries reference the ID weakly and can deal with the CSS having disappeared when it's looked up later. They pose no hurdle. Swap-out records do need to pin the css to hierarchically attribute swapins after the cgroup has been deleted; though the only pages that remain swapped out after offlining are tmpfs/shmem pages. And those references are under the user's control, so they are manageable. This patch introduces a private 16-bit memcg ID and switches swap and cache shadow entries over to using that. This ID can then be recycled after offlining when the CSS remains pinned only by objects that don't specifically need it. This script demonstrates the problem by faulting one cache page in a new cgroup and deleting it again: set -e mkdir -p pages for x in `seq 128000`; do [ $((x % 1000)) -eq 0 ] && echo $x mkdir /cgroup/foo echo $$ >/cgroup/foo/cgroup.procs echo trex >pages/$x echo $$ >/cgroup/cgroup.procs rmdir /cgroup/foo done When run on an unpatched kernel, we eventually run out of possible IDs even though there are no visible cgroups: [root@ham ~]# ./cssidstress.sh [...] 65000 mkdir: cannot create directory '/cgroup/foo': No space left on device After this patch, the IDs get released upon cgroup destruction and the cache and css objects get released once memory reclaim kicks in. [hannes@cmpxchg.org: init the IDR] Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: John Garcia Reviewed-by: Vladimir Davydov Acked-by: Tejun Heo Cc: Nikolay Borisov Cc: [3.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 ++++++-------- mm/memcontrol.c | 82 ++++++++++++++++++++++++++++++++++++++++++---- mm/slab_common.c | 4 +-- 3 files changed, 87 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a805474df4ab..56e6069d2452 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,6 +97,11 @@ enum mem_cgroup_events_target { #define MEM_CGROUP_ID_SHIFT 16 #define MEM_CGROUP_ID_MAX USHRT_MAX +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -172,6 +177,9 @@ enum memcg_kmem_state { struct mem_cgroup { struct cgroup_subsys_state css; + /* Private memcg ID. Used to ID objects that outlive the cgroup */ + struct mem_cgroup_id id; + /* Accounted resources */ struct page_counter memory; struct page_counter swap; @@ -330,22 +338,9 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) if (mem_cgroup_disabled()) return 0; - return memcg->css.id; -} - -/** - * mem_cgroup_from_id - look up a memcg from an id - * @id: the id to look up - * - * Caller must hold rcu_read_lock() and use css_tryget() as necessary. - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); + return memcg->id.id; } +struct mem_cgroup *mem_cgroup_from_id(unsigned short id); /** * parent_mem_cgroup - find the accounting parent of a memcg diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac8664db3823..5339c89dff63 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4057,6 +4057,60 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the oflline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. + */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg->id.ref); +} + +static void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + if (atomic_dec_and_test(&memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4116,6 +4170,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return NULL; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto fail; + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto fail; @@ -4142,8 +4202,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: + if (memcg->id.id > 0) + idr_remove(&mem_cgroup_idr, memcg->id.id); mem_cgroup_free(memcg); return NULL; } @@ -4206,12 +4269,11 @@ fail: return ERR_PTR(-ENOMEM); } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); return 0; } @@ -4234,6 +4296,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -5756,6 +5820,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); @@ -5774,6 +5839,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); } /* @@ -5804,11 +5872,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) !page_counter_try_charge(&memcg->swap, 1, &counter)) return -ENOMEM; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); return 0; } @@ -5837,7 +5905,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) page_counter_uncharge(&memcg->memsw, 1); } mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } diff --git a/mm/slab_common.c b/mm/slab_common.c index a65dad7fdcd1..82317abb03ed 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -526,8 +526,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); - cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - css->id, memcg_name_buf); + cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name, + css->serial_nr, memcg_name_buf); if (!cache_name) goto out_unlock; -- cgit v1.2.3