From bb3ec6b08396bbd631b6441102dd1c3d89cbc576 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 22 May 2013 12:18:47 +0200 Subject: mm: Fix virt_to_page() warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virt_to_page() is typically implemented as a macro containing a cast so that it will accept both pointers and unsigned long without causing a warning. But MIPS virt_to_page() uses virt_to_phys which is a function so passing an unsigned long will cause a warning: CC mm/page_alloc.o mm/page_alloc.c: In function ‘free_reserved_area’: mm/page_alloc.c:5161:3: warning: passing argument 1 of ‘virt_to_phys’ makes pointer from integer without a cast [enabled by default] arch/mips/include/asm/io.h:119:100: note: expected ‘const volatile void *’ but argument is of type ‘long unsigned int’ All others users of virt_to_page() in mm/ are passing a void *. Signed-off-by: Ralf Baechle Reported-by: Eunbong Song Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-mips@linux-mips.org Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 98cbdf6e5532..378a15bcd649 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5158,7 +5158,7 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end, for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { if (poison) memset((void *)pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page(pos)); + free_reserved_page(virt_to_page((void *)pos)); } if (pages && s) -- cgit v1.2.3 From d34883d4e35c0a994e91dd847a82b4c9e0c31d83 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 24 May 2013 15:55:11 -0700 Subject: mm: mmu_notifier: re-fix freed page still mapped in secondary MMU Commit 751efd8610d3 ("mmu_notifier_unregister NULL Pointer deref and multiple ->release()") breaks the fix 3ad3d901bbcf ("mm: mmu_notifier: fix freed page still mapped in secondary MMU"). Since hlist_for_each_entry_rcu() is changed now, we can not revert that patch directly, so this patch reverts the commit and simply fix the bug spotted by that patch This bug spotted by commit 751efd8610d3 is: There is a race condition between mmu_notifier_unregister() and __mmu_notifier_release(). Assume two tasks, one calling mmu_notifier_unregister() as a result of a filp_close() ->flush() callout (task A), and the other calling mmu_notifier_release() from an mmput() (task B). A B t1 srcu_read_lock() t2 if (!hlist_unhashed()) t3 srcu_read_unlock() t4 srcu_read_lock() t5 hlist_del_init_rcu() t6 synchronize_srcu() t7 srcu_read_unlock() t8 hlist_del_rcu() <--- NULL pointer deref. This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu. The another issue spotted in the commit is "multiple ->release() callouts", we needn't care it too much because it is really rare (e.g, can not happen on kvm since mmu-notify is unregistered after exit_mmap()) and the later call of multiple ->release should be fast since all the pages have already been released by the first call. Anyway, this issue should be fixed in a separate patch. -stable suggestions: Any version that has commit 751efd8610d3 need to be backported. I find the oldest version has this commit is 3.0-stable. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Xiao Guangrong Tested-by: Robin Holt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 79 +++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index be04122fb277..6725ff183374 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -40,48 +40,44 @@ void __mmu_notifier_release(struct mm_struct *mm) int id; /* - * srcu_read_lock() here will block synchronize_srcu() in - * mmu_notifier_unregister() until all registered - * ->release() callouts this function makes have - * returned. + * SRCU here will block mmu_notifier_unregister until + * ->release returns. */ id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) + /* + * If ->release runs before mmu_notifier_unregister it must be + * handled, as it's the only way for the driver to flush all + * existing sptes and stop the driver from establishing any more + * sptes before all the pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); + spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { mn = hlist_entry(mm->mmu_notifier_mm->list.first, struct mmu_notifier, hlist); - /* - * Unlink. This will prevent mmu_notifier_unregister() - * from also making the ->release() callout. + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than to wait + * for ->release to finish and for mmu_notifier_unregister to + * return. */ hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); - - /* - * Clear sptes. (see 'release' description in mmu_notifier.h) - */ - if (mn->ops->release) - mn->ops->release(mn, mm); - - spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * All callouts to ->release() which we have done are complete. - * Allow synchronize_srcu() in mmu_notifier_unregister() to complete - */ - srcu_read_unlock(&srcu, id); - - /* - * mmu_notifier_unregister() may have unlinked a notifier and may - * still be calling out to it. Additionally, other notifiers - * may have been active via vmtruncate() et. al. Block here - * to ensure that all notifier callouts for this mm have been - * completed and the sptes are really cleaned up before returning - * to exit_mmap(). + * synchronize_srcu here prevents mmu_notifier_release from returning to + * exit_mmap (which would proceed with freeing all pages in the mm) + * until the ->release method returns, if it was invoked by + * mmu_notifier_unregister. + * + * The mmu_notifier_mm can't go away from under us because one mm_count + * is held by exit_mmap. */ synchronize_srcu(&srcu); } @@ -292,31 +288,34 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { + /* + * SRCU here will force exit_mmap to wait for ->release to + * finish before freeing the pages. + */ int id; + id = srcu_read_lock(&srcu); /* - * Ensure we synchronize up with __mmu_notifier_release(). + * exit_mmap will block in mmu_notifier_release to guarantee + * that ->release is called before freeing the pages. */ - id = srcu_read_lock(&srcu); - - hlist_del_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); - if (mn->ops->release) mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); + spin_lock(&mm->mmu_notifier_mm->lock); /* - * Allow __mmu_notifier_release() to complete. + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. */ - srcu_read_unlock(&srcu, id); - } else + hlist_del_init_rcu(&mn->hlist); spin_unlock(&mm->mmu_notifier_mm->lock); + } /* - * Wait for any running method to finish, including ->release() if it - * was run by __mmu_notifier_release() instead of us. + * Wait for any running method to finish, of course including + * ->release if it was run by mmu_notifier_relase instead of us. */ synchronize_srcu(&srcu); -- cgit v1.2.3 From 28ccddf7952c496df2a51ce5aee4f2a058a98bab Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 May 2013 15:55:15 -0700 Subject: mm: memcg: remove incorrect VM_BUG_ON for swap cache pages in uncharge Commit 0c59b89c81ea ("mm: memcg: push down PageSwapCache check into uncharge entry functions") added a VM_BUG_ON() on PageSwapCache in the uncharge path after checking that page flag once, assuming that the state is stable in all paths, but this is not the case and the condition triggers in user environments. An uncharge after the last page table reference to the page goes away can race with reclaim adding the page to swap cache. Swap cache pages are usually uncharged when they are freed after swapout, from a path that also handles swap usage accounting and memcg lifetime management. However, since the last page table reference is gone and thus no references to the swap slot left, the swap slot will be freed shortly when reclaim attempts to write the page to disk. The whole swap accounting is not even necessary. So while the race condition for which this VM_BUG_ON was added is real and actually existed all along, there are no negative effects. Remove the VM_BUG_ON again. Reported-by: Heiko Carstens Reported-by: Lingzhu Xiang Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cb1c9dedf9b6..010d6c14129a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4108,8 +4108,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, if (mem_cgroup_disabled()) return NULL; - VM_BUG_ON(PageSwapCache(page)); - if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); @@ -4205,6 +4203,18 @@ void mem_cgroup_uncharge_page(struct page *page) if (page_mapped(page)) return; VM_BUG_ON(page->mapping && !PageAnon(page)); + /* + * If the page is in swap cache, uncharge should be deferred + * to the swap path, which also properly accounts swap usage + * and handles memcg lifetime. + * + * Note that this check is not stable and reclaim may add the + * page to swap cache at any time after this. However, if the + * page is not in swap cache by the time page->mapcount hits + * 0, there won't be any page table references to the swap + * slot, and reclaim will free it and not actually write the + * page to disk. + */ if (PageSwapCache(page)) return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); -- cgit v1.2.3 From c2cc499c5bcf9040a738f49e8051b42078205748 Mon Sep 17 00:00:00 2001 From: Leonid Yegoshin Date: Fri, 24 May 2013 15:55:18 -0700 Subject: mm compaction: fix of improper cache flush in migration code Page 'new' during MIGRATION can't be flushed with flush_cache_page(). Using flush_cache_page(vma, addr, pfn) is justified only if the page is already placed in process page table, and that is done right after flush_cache_page(). But without it the arch function has no knowledge of process PTE and does nothing. Besides that, flush_cache_page() flushes an application cache page, but the kernel has a different page virtual address and dirtied it. Replace it with flush_dcache_page(new) which is the proper usage. The old page is flushed in try_to_unmap_one() before migration. This bug takes place in Sead3 board with M14Kc MIPS CPU without cache aliasing (but Harvard arch - separate I and D cache) in tight memory environment (128MB) each 1-3days on SOAK test. It fails in cc1 during kernel build (SIGILL, SIGBUS, SIGSEG) if CONFIG_COMPACTION is switched ON. Signed-off-by: Leonid Yegoshin Cc: Leonid Yegoshin Acked-by: Rik van Riel Cc: Michal Hocko Acked-by: Mel Gorman Cc: Ralf Baechle Cc: Russell King Cc: David Miller Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 27ed22579fd9..b1f57501de9c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -165,7 +165,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = arch_make_huge_pte(pte, vma, new, 0); } #endif - flush_cache_page(vma, addr, pte_pfn(pte)); + flush_dcache_page(new); set_pte_at(mm, addr, ptep, pte); if (PageHuge(new)) { -- cgit v1.2.3 From 7c3425123ddfdc5f48e7913ff59d908789712b18 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 24 May 2013 15:55:21 -0700 Subject: mm/THP: use pmd_populate() to update the pmd with pgtable_t pointer We should not use set_pmd_at to update pmd_t with pgtable_t pointer. set_pmd_at is used to set pmd with huge pte entries and architectures like ppc64, clear few flags from the pte when saving a new entry. Without this change we observe bad pte errors like below on ppc64 with THP enabled. BUG: Bad page map in process ld mm=0xc000001ee39f4780 pte:7fc3f37848000001 pmd:c000001ec0000000 Signed-off-by: Aneesh Kumar K.V Cc: Hugh Dickins Cc: Benjamin Herrenschmidt Reviewed-by: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03a89a2f464b..362c329b83fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2325,7 +2325,12 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); - set_pmd_at(mm, address, pmd, _pmd); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(vma->anon_vma); goto out; -- cgit v1.2.3 From 348f9f05e0266822fa048f7fb3b039692a0cafbc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 May 2013 15:55:30 -0700 Subject: mm/memory_hotplug.c: fix printk format warnings Fix printk format warnings in mm/memory_hotplug.c by using "%pa": mm/memory_hotplug.c: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 2 has type 'resource_size_t' [-Wformat] mm/memory_hotplug.c: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 3 has type 'resource_size_t' [-Wformat] Signed-off-by: Randy Dunlap Reported-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a221fac1f47d..1ad92b46753e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -720,9 +720,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, start = phys_start_pfn << PAGE_SHIFT; size = nr_pages * PAGE_SIZE; ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) - pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n", - start, start + size - 1, ret); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { -- cgit v1.2.3 From a9ff785e4437c83d2179161e012f5bdfbd6381f0 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 24 May 2013 15:55:36 -0700 Subject: mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas A panic can be caused by simply cat'ing /proc//smaps while an application has a VM_PFNMAP range. It happened in-house when a benchmarker was trying to decipher the memory layout of his program. /proc//smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memory char/mspec.c - and probably several out-of-tree modules [akpm@linux-foundation.org: remove now-unused hugetlb_vma() stub] Signed-off-by: Cliff Wickman Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Dave Hansen Cc: David Sterba Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: "Kirill A. Shutemov" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 70 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 35aa294656cd..5da2cbcfdbb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. */ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - return NULL; -} - static int walk_hugetlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -198,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* - * handle hugetlb vma individually because pagetable walk for - * the hugetlb page is dependent on the architecture and - * we can't handled it in the same manner as non-huge pages. + * This function was not intended to be vma based. + * But there are vma special cases to be handled: + * - hugetlb vma's + * - VM_PFNMAP vma's */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* + * There are no page structures backing a VM_PFNMAP + * range, so do not allow split_huge_page_pmd(). + */ + if ((vma->vm_start <= addr) && + (vma->vm_flags & VM_PFNMAP)) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* - * Hugepage is very tightly coupled with vma, so - * walk through hugetlb entries within a given vma. + * Handle hugetlb vma individually because pagetable + * walk for the hugetlb page is dependent on the + * architecture and we can't handled it in the same + * manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* + * Hugepage is very tightly coupled with vma, + * so walk through hugetlb entries within a + * given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk->mm, next); + continue; + } } if (pgd_none_or_clear_bad(pgd)) { -- cgit v1.2.3