From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/mm/init.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/ia64') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f388b4e18a37..ea39eba61ef5 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2307,7 +2307,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t */ vma->vm_mm = mm; vma->vm_file = get_file(filp); - vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ /* diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 0eab454867a2..082e383c1b6f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -138,7 +138,8 @@ ia64_init_addr_space (void) vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; + vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP; down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); -- cgit v1.2.3 From 5d3a551c28c6669dc43be40d8fafafbc2ec8f42b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 8 Oct 2012 16:29:32 -0700 Subject: mm: hugetlb: add arch hook for clearing page flags before entering pool The core page allocator ensures that page flags are zeroed when freeing pages via free_pages_check. A number of architectures (ARM, PPC, MIPS) rely on this property to treat new pages as dirty with respect to the data cache and perform the appropriate flushing before mapping the pages into userspace. This can lead to cache synchronisation problems when using hugepages, since the allocator keeps its own pool of pages above the usual page allocator and does not reset the page flags when freeing a page into the pool. This patch adds a new architecture hook, arch_clear_hugepage_flags, so that architectures which rely on the page flags being in a particular state for fresh allocations can adjust the flags accordingly when a page is freed into the pool. Signed-off-by: Will Deacon Cc: Michal Hocko Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/hugetlb.h | 4 ++++ arch/mips/include/asm/hugetlb.h | 4 ++++ arch/powerpc/include/asm/hugetlb.h | 4 ++++ arch/s390/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 6 ++++++ arch/sparc/include/asm/hugetlb.h | 4 ++++ arch/tile/include/asm/hugetlb.h | 4 ++++ arch/x86/include/asm/hugetlb.h | 4 ++++ mm/hugetlb.c | 1 + 9 files changed, 32 insertions(+) (limited to 'arch/ia64') diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index da55c63728e0..94eaa5bd5d0c 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -77,4 +77,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 58d36889f09b..bd94946a18f3 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -112,4 +112,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dfdb95bc59a5..62e11a32c4c2 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -151,6 +151,10 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #else /* ! CONFIG_HUGETLB_PAGE */ static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d6e6e380564..fc322421b1cc 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -33,6 +33,7 @@ static inline int prepare_hugepage_range(struct file *file, } #define hugetlb_prefault_arch_hook(mm) do { } while (0) +#define arch_clear_hugepage_flags(page) do { } while (0) int arch_prepare_hugepage(struct page *page); void arch_release_hugepage(struct page *page); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 967068fb79ac..b3808c7d67b2 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -1,6 +1,7 @@ #ifndef _ASM_SH_HUGETLB_H #define _ASM_SH_HUGETLB_H +#include #include @@ -89,4 +90,9 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ + clear_bit(PG_dcache_clean, &page->flags); +} + #endif /* _ASM_SH_HUGETLB_H */ diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 177061064ee6..e7927c9758a1 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -82,4 +82,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_SPARC64_HUGETLB_H */ diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h index b2042380a5aa..0f885af2b621 100644 --- a/arch/tile/include/asm/hugetlb.h +++ b/arch/tile/include/asm/hugetlb.h @@ -106,6 +106,10 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #ifdef CONFIG_HUGETLB_SUPER_PAGES static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 439a9acc132d..bdd35dbd0605 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -90,4 +90,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd44..f1bb534254f6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,6 +637,7 @@ static void free_huge_page(struct page *page) h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); -- cgit v1.2.3 From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:19 -0700 Subject: readahead: fault retry breaks mmap file read random detection .fault now can retry. The retry can break state machine of .fault. In filemap_fault, if page is miss, ra->mmap_miss is increased. In the second try, since the page is in page cache now, ra->mmap_miss is decreased. And these are done in one fault, so we can't detect random mmap file access. Add a new flag to indicate .fault is tried once. In the second try, skip ra->mmap_miss decreasing. The filemap_fault state machine is ok with it. I only tested x86, didn't test other archs, but looks the change for other archs is obvious, but who knows :) Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault.c | 1 + arch/avr32/mm/fault.c | 1 + arch/cris/mm/fault.c | 1 + arch/hexagon/mm/vm_fault.c | 1 + arch/ia64/mm/fault.c | 1 + arch/m68k/mm/fault.c | 1 + arch/microblaze/mm/fault.c | 1 + arch/mips/mm/fault.c | 1 + arch/openrisc/mm/fault.c | 1 + arch/powerpc/mm/fault.c | 1 + arch/s390/mm/fault.c | 1 + arch/sh/mm/fault.c | 1 + arch/sparc/mm/fault_32.c | 1 + arch/sparc/mm/fault_64.c | 1 + arch/tile/mm/fault.c | 1 + arch/um/kernel/trap.c | 1 + arch/x86/mm/fault.c | 1 + arch/xtensa/mm/fault.c | 1 + include/linux/mm.h | 1 + mm/filemap.c | 4 ++-- 20 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/ia64') diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c3bd83450227..5dbf13f954f6 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -336,6 +336,7 @@ retry: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b92e60958617..b2f2d2d66849 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -152,6 +152,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would have diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 45fd542cf173..73312ab6c696 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -186,6 +186,7 @@ retry: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 06695cc4fe58..513b74cb397e 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -113,6 +113,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 8443daf4f515..6cf0341f978e 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -184,6 +184,7 @@ retry: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index aeebbb7b30f0..a563727806bf 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -170,6 +170,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index eb365d6795fa..714b35a9c4f7 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -233,6 +233,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index c14f6dfed995..9f513486af10 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -171,6 +171,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 40f850e9766c..e2bfafce66c5 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -183,6 +183,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5495ebe983a2..0a6b28336eb0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -451,6 +451,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ac9122ca1152..04ad4001a289 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -367,6 +367,7 @@ retry: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; } diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 3bdc1ad9a341..cbbdcad8fcb3 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -504,6 +504,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 77ac917be152..e98bfda205a2 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -265,6 +265,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 1fe0429b6314..413d29263304 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -452,6 +452,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 84ce7abbf5af..fe811fa5f1b9 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -454,6 +454,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 0353b98ae35a..0f00e9c82080 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -89,6 +89,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a530b230e7d7..8e13ecb41bee 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1220,6 +1220,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 5a74c53bc69c..2c2f710ed1dc 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -126,6 +126,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/include/linux/mm.h b/include/linux/mm.h index b01e585ab4b5..bcaab4e6fe91 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -161,6 +161,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x40 /* second try */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/filemap.c b/mm/filemap.c index a9827b42556e..83efee76a5c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page)) { + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* * We found the page, so try async readahead before * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - } else { + } else if (!page) { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); -- cgit v1.2.3 From 7f1290f2f2a4d2c3f1b7ce8e87256e052ca23125 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Mon, 8 Oct 2012 16:33:06 -0700 Subject: mm: fix-up zone present pages I think zone->present_pages indicates pages that buddy system can management, it should be: zone->present_pages = spanned pages - absent pages - bootmem pages, but is now: zone->present_pages = spanned pages - absent pages - memmap pages. spanned pages: total size, including holes. absent pages: holes. bootmem pages: pages used in system boot, managed by bootmem allocator. memmap pages: pages used by page structs. This may cause zone->present_pages less than it should be. For example, numa node 1 has ZONE_NORMAL and ZONE_MOVABLE, it's memmap and other bootmem will be allocated from ZONE_MOVABLE, so ZONE_NORMAL's present_pages should be spanned pages - absent pages, but now it also minus memmap pages(free_area_init_core), which are actually allocated from ZONE_MOVABLE. When offlining all memory of a zone, this will cause zone->present_pages less than 0, because present_pages is unsigned long type, it is actually a very large integer, it indirectly caused zone->watermark[WMARK_MIN] becomes a large integer(setup_per_zone_wmarks()), than cause totalreserve_pages become a large integer(calculate_totalreserve_pages()), and finally cause memory allocating failure when fork process(__vm_enough_memory()). [root@localhost ~]# dmesg -bash: fork: Cannot allocate memory I think the bug described in http://marc.info/?l=linux-mm&m=134502182714186&w=2 is also caused by wrong zone present pages. This patch intends to fix-up zone->present_pages when memory are freed to buddy system on x86_64 and IA64 platforms. Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reported-by: Petr Tesarik Tested-by: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 1 + include/linux/mm.h | 4 ++++ mm/bootmem.c | 10 +++++++++- mm/memory_hotplug.c | 7 +++++++ mm/nobootmem.c | 3 +++ mm/page_alloc.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 58 insertions(+), 1 deletion(-) (limited to 'arch/ia64') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 082e383c1b6f..acd5b68e8871 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -637,6 +637,7 @@ mem_init (void) high_memory = __va(max_low_pfn * PAGE_SIZE); + reset_zone_present_pages(); for_each_online_pgdat(pgdat) if (pgdat->bdata->node_bootmem_map) totalram_pages += free_all_bootmem_node(pgdat); diff --git a/include/linux/mm.h b/include/linux/mm.h index bcaab4e6fe91..fa0680402738 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1684,5 +1684,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ +extern void reset_zone_present_pages(void); +extern void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185b3b28..434be4ae7a04 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); + fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), + start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); + fixup_zone_present_pages( + page_to_nid(page), + start + off, start + off + 1); count++; } vec >>= 1; @@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) + while (pages--) { + fixup_zone_present_pages(page_to_nid(page), + page_to_pfn(page), page_to_pfn(page) + 1); __free_pages_bootmem(page++, 0); + } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f9ac0955e10a..ce690a911f1b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; + struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); + + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages++; + zone_span_writeunlock(zone); + totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b31411..714d5d650470 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); + fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), + start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; + reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ac593893e6e..00750bc08a3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6087,3 +6087,37 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } + +/* reset zone->present_pages */ +void reset_zone_present_pages(void) +{ + struct zone *z; + int i, nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + z->present_pages = 0; + } + } +} + +/* calculate zone's present pages in buddy system */ +void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct zone *z; + unsigned long zone_start_pfn, zone_end_pfn; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + zone_start_pfn = z->zone_start_pfn; + zone_end_pfn = zone_start_pfn + z->spanned_pages; + + /* if the two regions intersect */ + if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) + z->present_pages += min(end_pfn, zone_end_pfn) - + max(start_pfn, zone_start_pfn); + } +} -- cgit v1.2.3