From 94cb121c9483f1ec9b1ef0c249fbfc49c628fa6b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 7 Aug 2010 03:26:24 +0900 Subject: percpu: add __percpu notations to UP allocator Add __percpu notations to UP percpu allocator. Signed-off-by: Namhyung Kim Signed-off-by: Tejun Heo --- mm/percpu_up.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu_up.c b/mm/percpu_up.c index c4351c7f57d2..db884fae5721 100644 --- a/mm/percpu_up.c +++ b/mm/percpu_up.c @@ -14,13 +14,13 @@ void __percpu *__alloc_percpu(size_t size, size_t align) * percpu sections on SMP for which this path isn't used. */ WARN_ON_ONCE(align > SMP_CACHE_BYTES); - return kzalloc(size, GFP_KERNEL); + return (void __percpu __force *)kzalloc(size, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); void free_percpu(void __percpu *p) { - kfree(p); + kfree(this_cpu_ptr(p)); } EXPORT_SYMBOL_GPL(free_percpu); -- cgit v1.2.3 From 6628bc74f1aa9c35dd386320bf7ec04f12edb1b3 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 27 Aug 2010 09:15:09 +0200 Subject: writeback: do not lose wakeup events when forking bdi threads This patch fixes the following issue: INFO: task mount.nfs4:1120 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. mount.nfs4 D 00000000fffc6a21 0 1120 1119 0x00000000 ffff880235643948 0000000000000046 ffffffff00000000 ffffffff00000000 ffff880235643fd8 ffff880235314760 00000000001d44c0 ffff880235643fd8 00000000001d44c0 00000000001d44c0 00000000001d44c0 00000000001d44c0 Call Trace: [] schedule_timeout+0x34/0xf1 [] ? wait_for_common+0x3f/0x130 [] ? trace_hardirqs_on+0xd/0xf [] wait_for_common+0xd2/0x130 [] ? default_wake_function+0x0/0xf [] ? _raw_spin_unlock+0x26/0x2a [] wait_for_completion+0x18/0x1a [] sync_inodes_sb+0xca/0x1bc [] __sync_filesystem+0x47/0x7e [] sync_filesystem+0x47/0x4b [] generic_shutdown_super+0x22/0xd2 [] kill_anon_super+0x11/0x4f [] nfs4_kill_super+0x3f/0x72 [nfs] [] deactivate_locked_super+0x21/0x41 [] deactivate_super+0x40/0x45 [] mntput_no_expire+0xb8/0xed [] release_mounts+0x9a/0xb0 [] put_mnt_ns+0x6a/0x7b [] nfs_follow_remote_path+0x19a/0x296 [nfs] [] nfs4_try_mount+0x75/0xaf [nfs] [] nfs4_get_sb+0x276/0x2ff [nfs] [] vfs_kern_mount+0xb8/0x196 [] do_kern_mount+0x48/0xe8 [] do_mount+0x771/0x7e8 [] sys_mount+0x83/0xbd [] system_call_fastpath+0x16/0x1b The reason of this hang was a race condition: when the flusher thread is forking a bdi thread, we use 'kthread_run()', so we run it _before_ we make it visible in 'bdi->wb.task'. The bdi thread runs, does all works, and goes sleep. 'bdi->wb.task' is still NULL. And this is a dangerous time window. If at this time someone queues a work for this bdi, he does not see the bdi thread and wakes up the forker thread instead! But the forker has already forked this bdi thread, but just did not make it visible yet! The result is that we lose the wake up event for this bdi thread and the NFS4 code waits forever. To fix the problem, we should use 'ktrhead_create()' for creating bdi threads, then make them visible in 'bdi->wb.task', and only after this wake them up. This is exactly what this patch does. Signed-off-by: Artem Bityutskiy Signed-off-by: Jens Axboe --- mm/backing-dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eaa4a5bbe063..c2bf86f470ed 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -445,8 +445,8 @@ static int bdi_forker_thread(void *ptr) switch (action) { case FORK_THREAD: __set_current_state(TASK_RUNNING); - task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s", - dev_name(bdi->dev)); + task = kthread_create(bdi_writeback_thread, &bdi->wb, + "flush-%s", dev_name(bdi->dev)); if (IS_ERR(task)) { /* * If thread creation fails, force writeout of @@ -457,10 +457,13 @@ static int bdi_forker_thread(void *ptr) /* * The spinlock makes sure we do not lose * wake-ups when racing with 'bdi_queue_work()'. + * And as soon as the bdi thread is visible, we + * can start it. */ spin_lock_bh(&bdi->wb_lock); bdi->wb.task = task; spin_unlock_bh(&bdi->wb_lock); + wake_up_process(task); } break; -- cgit v1.2.3 From a002d148426f40bc2b7dc066982eb177cdebeaaa Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Sun, 8 Aug 2010 14:39:07 +0200 Subject: percpu: fix a memory leak in pcpu_extend_area_map() The original code did not free the old map. This patch fixes it. tj: use @old as memcpy source instead of @chunk->map, and indentation and description update Signed-off-by: Huang Shijie Signed-off-by: Tejun Heo Cc: stable@kernel.org --- mm/percpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index e61dc2cc5873..a1830d8e3318 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -393,7 +393,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) goto out_unlock; old_size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, old_size); + old = chunk->map; + + memcpy(new, old, old_size); chunk->map_alloc = new_alloc; chunk->map = new; -- cgit v1.2.3 From 54157c44471f5e266508ac08d270f2bc5857e8bb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 11 Aug 2010 11:19:19 +0900 Subject: percpu: fix a mismatch between code and comment When pcpu_build_alloc_info() searches best_upa value, it ignores current value if the number of waste units exceeds 1/3 of the number of total cpus. But the comment on the code says that it will ignore if wastage is over 25%. Modify the comment. Signed-off-by: Namhyung Kim Signed-off-by: Tejun Heo --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index a1830d8e3318..58c572b18b07 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1164,7 +1164,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( } /* - * Don't accept if wastage is over 25%. The + * Don't accept if wastage is over 1/3. The * greater-than comparison ensures upa==1 always * passes the following check. */ -- cgit v1.2.3 From 39aa3cb3e8250db9188a6f1e3fb62ffa1a717678 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Tue, 31 Aug 2010 15:52:27 +0200 Subject: mm: Move vma_stack_continue into mm.h So it can be used by all that need to check for that. Signed-off-by: Stefan Bader Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- include/linux/mm.h | 6 ++++++ mm/mlock.c | 6 ------ 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 439fc1f1c1c4..271afc48b9a5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -224,7 +224,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; if (vma->vm_flags & VM_GROWSDOWN) - start += PAGE_SIZE; + if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) + start += PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, diff --git a/include/linux/mm.h b/include/linux/mm.h index e6b1210772ce..74949fbef8c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -864,6 +864,12 @@ int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +/* Is the vma a continuation of the stack vma above it? */ +static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) +{ + return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); +} + extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len); diff --git a/mm/mlock.c b/mm/mlock.c index cbae7c5b9568..b70919ce4f72 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -135,12 +135,6 @@ void munlock_vma_page(struct page *page) } } -/* Is the vma a continuation of the stack vma above it? */ -static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) -{ - return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); -} - static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) { return (vma->vm_flags & VM_GROWSDOWN) && -- cgit v1.2.3 From 4969c1192d15afa3389e7ae3302096ff684ba655 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Sep 2010 16:37:52 -0700 Subject: mm: fix swapin race condition The pte_same check is reliable only if the swap entry remains pinned (by the page lock on swapcache). We've also to ensure the swapcache isn't removed before we take the lock as try_to_free_swap won't care about the page pin. One of the possible impacts of this patch is that a KSM-shared page can point to the anon_vma of another process, which could exit before the page is freed. This can leave a page with a pointer to a recycled anon_vma object, or worse, a pointer to something that is no longer an anon_vma. [riel@redhat.com: changelog help] Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Reviewed-by: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 20 +++++++++----------- mm/ksm.c | 3 --- mm/memory.c | 39 ++++++++++++++++++++++++++++++++++----- 3 files changed, 43 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 74d691ee9121..3319a6967626 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -16,6 +16,9 @@ struct stable_node; struct mem_cgroup; +struct page *ksm_does_need_to_copy(struct page *page, + struct vm_area_struct *vma, unsigned long address); + #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); @@ -70,19 +73,14 @@ static inline void set_page_stable_node(struct page *page, * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ -struct page *ksm_does_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address); -static inline struct page *ksm_might_need_to_copy(struct page *page, +static inline int ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { struct anon_vma *anon_vma = page_anon_vma(page); - if (!anon_vma || - (anon_vma->root == vma->anon_vma->root && - page->index == linear_page_index(vma, address))) - return page; - - return ksm_does_need_to_copy(page, vma, address); + return anon_vma && + (anon_vma->root != vma->anon_vma->root || + page->index != linear_page_index(vma, address)); } int page_referenced_ksm(struct page *page, @@ -115,10 +113,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; } -static inline struct page *ksm_might_need_to_copy(struct page *page, +static inline int ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { - return page; + return 0; } static inline int page_referenced_ksm(struct page *page, diff --git a/mm/ksm.c b/mm/ksm.c index e2ae00458320..b1873cf03ed9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1504,8 +1504,6 @@ struct page *ksm_does_need_to_copy(struct page *page, { struct page *new_page; - unlock_page(page); /* any racers will COW it, not modify it */ - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { copy_user_highpage(new_page, page, address, vma); @@ -1521,7 +1519,6 @@ struct page *ksm_does_need_to_copy(struct page *page, add_page_to_unevictable_list(new_page); } - page_cache_release(page); return new_page; } diff --git a/mm/memory.c b/mm/memory.c index 6b2ab1051851..71b161b73bb5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2623,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *page; + struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; struct mem_cgroup *ptr = NULL; @@ -2679,10 +2679,23 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - page = ksm_might_need_to_copy(page, vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; + /* + * Make sure try_to_free_swap didn't release the swapcache + * from under us. The page pin isn't enough to prevent that. + */ + if (unlikely(!PageSwapCache(page))) + goto out_page; + + if (ksm_might_need_to_copy(page, vma, address)) { + swapcache = page; + page = ksm_does_need_to_copy(page, vma, address); + + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + swapcache = NULL; + goto out_page; + } } if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { @@ -2735,6 +2748,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); + if (swapcache) { + /* + * Hold the lock to avoid the swap entry to be reused + * until we take the PT lock for the pte_same() check + * (to avoid false positives from pte_same). For + * further safety release the lock after the swap_free + * so that the swap count won't change under a + * parallel locked swapcache. + */ + unlock_page(swapcache); + page_cache_release(swapcache); + } if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); @@ -2756,6 +2781,10 @@ out_page: unlock_page(page); out_release: page_cache_release(page); + if (swapcache) { + unlock_page(swapcache); + page_cache_release(swapcache); + } return ret; } -- cgit v1.2.3 From 152e0659fc001029c70fa4373af1792b1ae0d01c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Sep 2010 16:37:53 -0700 Subject: mm: avoid warning when COMPACTION is selected COMPACTION enables MIGRATION, but MIGRATION spawns a warning if numa or memhotplug aren't selected. However MIGRATION doesn't depend on them. I guess it's just trying to be strict doing a double check on who's enabling it, but it doesn't know that compaction also enables MIGRATION. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index f4e516e9c37c..f0fb9124e410 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -189,7 +189,7 @@ config COMPACTION config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in -- cgit v1.2.3 From bc6930457460788e14b2c0808ed4632a1592bd61 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 9 Sep 2010 16:38:00 -0700 Subject: mm: compaction: handle active and inactive fairly in too_many_isolated Iram reported that compaction's too_many_isolated() loops forever. (http://www.spinics.net/lists/linux-mm/msg08123.html) The meminfo when the situation happened was inactive anon is zero. That's because the system has no memory pressure until then. While all anon pages were in the active lru, compaction could select active lru as well as inactive lru. That's a different thing from vmscan's isolated. So we has been two too_many_isolated. While compaction can isolate pages in both active and inactive, current implementation of too_many_isolated only considers inactive. It made Iram's problem. This patch handles active and inactive fairly. That's because we can't expect where from and how many compaction would isolated pages. This patch changes (nr_isolated > nr_inactive) with nr_isolated > (nr_active + nr_inactive) / 2. Signed-off-by: Minchan Kim Reported-by: Iram Shahzad Acked-by: Mel Gorman Acked-by: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 94cce51b0b35..4d709ee59013 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct zone *zone) { - - unsigned long inactive, isolated; + unsigned long active, inactive, isolated; inactive = zone_page_state(zone, NR_INACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_ANON); + active = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_ANON); isolated = zone_page_state(zone, NR_ISOLATED_FILE) + zone_page_state(zone, NR_ISOLATED_ANON); - return isolated > inactive; + return isolated > (inactive + active) / 2; } /* -- cgit v1.2.3 From 0dcc48c15f63ee86c2fcd33968b08d651f0360a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 9 Sep 2010 16:38:01 -0700 Subject: memory hotplug: fix next block calculation in is_removable next_active_pageblock() is for finding next _used_ freeblock. It skips several blocks when it finds there are a chunk of free pages lager than pageblock. But it has 2 bugs. 1. We have no lock. page_order(page) - pageblock_order can be minus. 2. pageblocks_stride += is wrong. it should skip page_order(p) of pages. Signed-off-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Wu Fengguang Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a4cfcdc00455..dd186c1a5d53 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -584,19 +584,19 @@ static inline int pageblock_free(struct page *page) /* Return the start of the next active pageblock after a given page */ static struct page *next_active_pageblock(struct page *page) { - int pageblocks_stride; - /* Ensure the starting page is pageblock-aligned */ BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); - /* Move forward by at least 1 * pageblock_nr_pages */ - pageblocks_stride = 1; - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) - pageblocks_stride += page_order(page) - pageblock_order; + if (pageblock_free(page)) { + int order; + /* be careful. we don't have locks, page_order can be changed.*/ + order = page_order(page); + if ((order < MAX_ORDER) && (order >= pageblock_order)) + return page + (1 << order); + } - return page + (pageblocks_stride * pageblock_nr_pages); + return page + pageblock_nr_pages; } /* Checks if this range of memory is likely to be hot-removable. */ -- cgit v1.2.3 From ac8456d6f9a3011c824176bd6084d39e5f70a382 Mon Sep 17 00:00:00 2001 From: Gary King Date: Thu, 9 Sep 2010 16:38:05 -0700 Subject: bounce: call flush_dcache_page() after bounce_copy_vec() I have been seeing problems on Tegra 2 (ARMv7 SMP) systems with HIGHMEM enabled on 2.6.35 (plus some patches targetted at 2.6.36 to perform cache maintenance lazily), and the root cause appears to be that the mm bouncing code is calling flush_dcache_page before it copies the bounce buffer into the bio. The bounced page needs to be flushed after data is copied into it, to ensure that architecture implementations can synchronize instruction and data caches if necessary. Signed-off-by: Gary King Cc: Tejun Heo Cc: Russell King Acked-by: Jens Axboe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bounce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bounce.c b/mm/bounce.c index 13b6dad1eed2..1481de68184b 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) */ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - flush_dcache_page(tovec->bv_page); bounce_copy_vec(tovec, vfrom); + flush_dcache_page(tovec->bv_page); } } -- cgit v1.2.3 From 910321ea817a202ff70fac666e37e2c8e2f88823 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 9 Sep 2010 16:38:07 -0700 Subject: swap: revert special hibernation allocation Please revert 2.6.36-rc commit d2997b1042ec150616c1963b5e5e919ffd0b0ebf "hibernation: freeze swap at hibernation". It complicated matters by adding a second swap allocation path, just for hibernation; without in any way fixing the issue that it was intended to address - page reclaim after fixing the hibernation image might free swap from a page already imaged as swapcache, letting its swap be reallocated to store a different page of the image: resulting in data corruption if the imaged page were freed as clean then swapped back in. Pages freed to si->swap_map were still in danger of being reallocated by the alternative allocation path. I guess it inadvertently fixed slow SSD swap allocation for hibernation, as reported by Nigel Cunningham: by missing out the discards that occur on the usual swap allocation path; but that was unintentional, and needs a separate fix. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: "Rafael J. Wysocki" Cc: Ondrej Zary Cc: Andrea Gelmini Cc: Balbir Singh Cc: Andrea Arcangeli Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 8 +---- kernel/power/hibernate.c | 1 - kernel/power/snapshot.c | 1 - kernel/power/swap.c | 6 ++-- mm/swapfile.c | 94 ++++++++++++------------------------------------ 5 files changed, 26 insertions(+), 84 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fee51a11b73..bf4eb62506db 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -315,6 +315,7 @@ extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page_of_type(int); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); @@ -331,13 +332,6 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_HIBERNATION -void hibernation_freeze_swap(void); -void hibernation_thaw_swap(void); -swp_entry_t get_swap_for_hibernation(int type); -void swap_free_for_hibernation(swp_entry_t val); -#endif - /* linux/mm/thrash.c */ extern struct mm_struct *swap_token_mm; extern void grab_swap_token(struct mm_struct *); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c77963938bca..8dc31e02ae12 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -338,7 +338,6 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - hibernation_freeze_swap(); saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5e7edfb05e66..f6cd6faf84fd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1086,7 +1086,6 @@ void swsusp_free(void) buffer = NULL; alloc_normal = 0; alloc_highmem = 0; - hibernation_thaw_swap(); } /* Helper functions used for the shrinking of memory. */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 5d0059eed3e4..e6a5bdf61a37 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap) { unsigned long offset; - offset = swp_offset(get_swap_for_hibernation(swap)); + offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free_for_hibernation(swp_entry(swap, offset)); + swap_free(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -163,7 +163,7 @@ void free_all_swap_pages(int swap) ext = container_of(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) - swap_free_for_hibernation(swp_entry(swap, offset)); + swap_free(swp_entry(swap, offset)); kfree(ext); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f3f9c59a73a..f08d165871b3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,8 +47,6 @@ long nr_swap_pages; long total_swap_pages; static int least_priority; -static bool swap_for_hibernation; - static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; @@ -453,8 +451,6 @@ swp_entry_t get_swap_page(void) spin_lock(&swap_lock); if (nr_swap_pages <= 0) goto noswap; - if (swap_for_hibernation) - goto noswap; nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { @@ -487,6 +483,28 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + spin_lock(&swap_lock); + si = swap_info[type]; + if (si && (si->flags & SWP_WRITEOK)) { + nr_swap_pages--; + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, 1); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + nr_swap_pages++; + } + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + static struct swap_info_struct *swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -746,74 +764,6 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) #endif #ifdef CONFIG_HIBERNATION - -static pgoff_t hibernation_offset[MAX_SWAPFILES]; -/* - * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise, - * saved swap_map[] image to the disk will be an incomplete because it's - * changing without synchronization with hibernation snap shot. - * At resume, we just make swap_for_hibernation=false. We can forget - * used maps easily. - */ -void hibernation_freeze_swap(void) -{ - int i; - - spin_lock(&swap_lock); - - printk(KERN_INFO "PM: Freeze Swap\n"); - swap_for_hibernation = true; - for (i = 0; i < MAX_SWAPFILES; i++) - hibernation_offset[i] = 1; - spin_unlock(&swap_lock); -} - -void hibernation_thaw_swap(void) -{ - spin_lock(&swap_lock); - if (swap_for_hibernation) { - printk(KERN_INFO "PM: Thaw Swap\n"); - swap_for_hibernation = false; - } - spin_unlock(&swap_lock); -} - -/* - * Because updateing swap_map[] can make not-saved-status-change, - * we use our own easy allocator. - * Please see kernel/power/swap.c, Used swaps are recorded into - * RB-tree. - */ -swp_entry_t get_swap_for_hibernation(int type) -{ - pgoff_t off; - swp_entry_t val = {0}; - struct swap_info_struct *si; - - spin_lock(&swap_lock); - - si = swap_info[type]; - if (!si || !(si->flags & SWP_WRITEOK)) - goto done; - - for (off = hibernation_offset[type]; off < si->max; ++off) { - if (!si->swap_map[off]) - break; - } - if (off < si->max) { - val = swp_entry(type, off); - hibernation_offset[type] = off + 1; - } -done: - spin_unlock(&swap_lock); - return val; -} - -void swap_free_for_hibernation(swp_entry_t ent) -{ - /* Nothing to do */ -} - /* * Find the swap type that corresponds to given device (if any). * -- cgit v1.2.3 From b73d7fcecd93dc15eaa3c45c8c587b613f6673c4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 9 Sep 2010 16:38:09 -0700 Subject: swap: prevent reuse during hibernation Move the hibernation check from scan_swap_map() into try_to_free_swap(): to catch not only the common case when hibernation's allocation itself triggers swap reuse, but also the less likely case when concurrent page reclaim (shrink_page_list) might happen to try_to_free_swap from a page. Hibernation already clears __GFP_IO from the gfp_allowed_mask, to stop reclaim from going to swap: check that to prevent swap reuse too. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: "Rafael J. Wysocki" Cc: Ondrej Zary Cc: Andrea Gelmini Cc: Balbir Singh Cc: Andrea Arcangeli Cc: Nigel Cunningham Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index f08d165871b3..ed5151079f59 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -318,10 +318,8 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; - /* reuse swap entry of cache-only swap if not hibernation. */ - if (vm_swap_full() - && usage == SWAP_HAS_CACHE - && si->swap_map[offset] == SWAP_HAS_CACHE) { + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&swap_lock); swap_was_freed = __try_to_reclaim_swap(si, offset); @@ -688,6 +686,24 @@ int try_to_free_swap(struct page *page) if (page_swapcount(page)) return 0; + /* + * Once hibernation has begun to create its image of memory, + * there's a danger that one of the calls to try_to_free_swap() + * - most probably a call from __try_to_reclaim_swap() while + * hibernation is allocating its own swap pages for the image, + * but conceivably even a call from memory reclaim - will free + * the swap from a page which has already been recorded in the + * image as a clean swapcache page, and then reuse its swap for + * another page of the image. On waking from hibernation, the + * original page might be freed under memory pressure, then + * later read back in from swap, now with the wrong data. + * + * Hibernation clears bits from gfp_allowed_mask to prevent + * memory reclaim from writing to disk, so check that here. + */ + if (!(gfp_allowed_mask & __GFP_IO)) + return 0; + delete_from_swap_cache(page); SetPageDirty(page); return 1; -- cgit v1.2.3 From 8f2ae0faa3a119158c4dcfe89926d6fad5f5332c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Sep 2010 16:38:10 -0700 Subject: swap: do not send discards as barriers The swap code already uses synchronous discards, no need to add I/O barriers. This fixes the worst of the terrible slowdown in swap allocation for hibernation, reported on 2.6.35 by Nigel Cunningham; but does not entirely eliminate that regression. [tj@kernel.org: superflous newlines removed] Signed-off-by: Christoph Hellwig Tested-by: Nigel Cunningham Signed-off-by: Tejun Heo Signed-off-by: Hugh Dickins Cc: Jens Axboe Cc: James Bottomley Cc: "Martin K. Petersen" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index ed5151079f59..1894dead0b58 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -139,8 +139,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); if (err) return err; cond_resched(); @@ -151,8 +150,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); if (err) break; @@ -191,8 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | - BLKDEV_IFL_BARRIER)) + nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) break; } -- cgit v1.2.3 From 3399446632739fcd05fd8b272b476a69c6e6d14a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 9 Sep 2010 16:38:11 -0700 Subject: swap: discard while swapping only if SWAP_FLAG_DISCARD Tests with recent firmware on Intel X25-M 80GB and OCZ Vertex 60GB SSDs show a shift since I last tested in December: in part because of firmware updates, in part because of the necessary move from barriers to awaiting completion at the block layer. While discard at swapon still shows as slightly beneficial on both, discarding 1MB swap cluster when allocating is now disadvanteous: adds 25% overhead on Intel, adds 230% on OCZ (YMMV). Surrender: discard as presently implemented is more hindrance than help for swap; but might prove useful on other devices, or with improvements. So continue to do the discard at swapon, but make discard while swapping conditional on a SWAP_FLAG_DISCARD to sys_swapon() (which has been using only the lower 16 bits of int flags). We can add a --discard or -d to swapon(8), and a "discard" to swap in /etc/fstab: matching the mount option for btrfs, ext4, fat, gfs2, nilfs2. Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Nigel Cunningham Cc: Tejun Heo Cc: Jens Axboe Cc: James Bottomley Cc: "Martin K. Petersen" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/swapfile.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index bf4eb62506db..7cdd63366f88 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -19,6 +19,7 @@ struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 +#define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */ static inline int current_is_kswapd(void) { @@ -142,7 +143,7 @@ struct swap_extent { enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ + SWP_DISCARDABLE = (1 << 2), /* swapon+blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 1894dead0b58..7c703ff2f36f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2047,7 +2047,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (random32() % p->highest_bit); } - if (discard_swap(p) == 0) + if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) p->flags |= SWP_DISCARDABLE; } -- cgit v1.2.3 From 5ee28a447625b9fe64fbf7cff026561084fc5f16 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 9 Sep 2010 16:38:14 -0700 Subject: vmstat: update zone stat threshold when onlining a cpu refresh_zone_stat_thresholds() calculates parameter based on the number of online cpus. It's called at cpu offlining but needs to be called at onlining, too. Signed-off-by: KAMEZAWA Hiroyuki Cc: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index f389168f9a83..a8d6b59e609a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -998,6 +998,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + refresh_zone_stat_thresholds(); start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); break; -- cgit v1.2.3 From 72853e2991a2702ae93aaf889ac7db743a415dd3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Sep 2010 16:38:16 -0700 Subject: mm: page allocator: update free page counters after pages are placed on the free list When allocating a page, the system uses NR_FREE_PAGES counters to determine if watermarks would remain intact after the allocation was made. This check is made without interrupts disabled or the zone lock held and so is race-prone by nature. Unfortunately, when pages are being freed in batch, the counters are updated before the pages are added on the list. During this window, the counters are misleading as the pages do not exist yet. When under significant pressure on systems with large numbers of CPUs, it's possible for processes to make progress even though they should have been stalled. This is particularly problematic if a number of the processes are using GFP_ATOMIC as the min watermark can be accidentally breached and in extreme cases, the system can livelock. This patch updates the counters after the pages have been added to the list. This makes the allocator more cautious with respect to preserving the watermarks and mitigates livelock possibilities. [akpm@linux-foundation.org: avoid modifying incoming args] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9649f4b261e..452e2ba06c7c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -588,13 +588,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int to_free = count; spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count); - while (count) { + while (to_free) { struct page *page; struct list_head *list; @@ -619,8 +619,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, page_private(page)); trace_mm_page_pcpu_drain(page, 0, page_private(page)); - } while (--count && --batch_free && !list_empty(list)); + } while (--to_free && --batch_free && !list_empty(list)); } + __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -631,8 +632,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } -- cgit v1.2.3 From aa45484031ddee09b06350ab8528bfe5b2c76d1c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 9 Sep 2010 16:38:17 -0700 Subject: mm: page allocator: calculate a better estimate of NR_FREE_PAGES when memory is low and kswapd is awake Ordinarily watermark checks are based on the vmstat NR_FREE_PAGES as it is cheaper than scanning a number of lists. To avoid synchronization overhead, counter deltas are maintained on a per-cpu basis and drained both periodically and when the delta is above a threshold. On large CPU systems, the difference between the estimated and real value of NR_FREE_PAGES can be very high. If NR_FREE_PAGES is much higher than number of real free page in buddy, the VM can allocate pages below min watermark, at worst reducing the real number of pages to zero. Even if the OOM killer kills some victim for freeing memory, it may not free memory if the exit path requires a new page resulting in livelock. This patch introduces a zone_page_state_snapshot() function (courtesy of Christoph) that takes a slightly more accurate view of an arbitrary vmstat counter. It is used to read NR_FREE_PAGES while kswapd is awake to avoid the watermark being accidentally broken. The estimate is not perfect and may result in cache line bounces but is expected to be lighter than the IPI calls necessary to continually drain the per-cpu counters while kswapd is awake. Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 +++++++++++++ include/linux/vmstat.h | 22 ++++++++++++++++++++++ mm/mmzone.c | 21 +++++++++++++++++++++ mm/page_alloc.c | 4 ++-- mm/vmstat.c | 15 ++++++++++++++- 5 files changed, 72 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6e6e62648a4d..3984c4eb41fd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -283,6 +283,13 @@ struct zone { /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; + /* + * When free pages are below this point, additional steps are taken + * when reading the number of free pages to avoid per-cpu counter + * drift allowing watermarks to be breached + */ + unsigned long percpu_drift_mark; + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several @@ -441,6 +448,12 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } +#ifdef CONFIG_SMP +unsigned long zone_nr_free_pages(struct zone *zone); +#else +#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) +#endif /* CONFIG_SMP */ + /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 7f43ccdc1d38..eaaea37b3b75 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -170,6 +170,28 @@ static inline unsigned long zone_page_state(struct zone *zone, return x; } +/* + * More accurate version that also considers the currently pending + * deltas. For that we need to loop over all cpus to find the current + * deltas. There is no synchronization so the result cannot be + * exactly accurate either. + */ +static inline unsigned long zone_page_state_snapshot(struct zone *zone, + enum zone_stat_item item) +{ + long x = atomic_long_read(&zone->vm_stat[item]); + +#ifdef CONFIG_SMP + int cpu; + for_each_online_cpu(cpu) + x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; + + if (x < 0) + x = 0; +#endif + return x; +} + extern unsigned long global_reclaimable_pages(void); extern unsigned long zone_reclaimable_pages(struct zone *zone); diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d1760213..e35bfb82c855 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +#ifdef CONFIG_SMP +/* Called when a more accurate view of NR_FREE_PAGES is needed */ +unsigned long zone_nr_free_pages(struct zone *zone) +{ + unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); + + /* + * While kswapd is awake, it is considered the zone is under some + * memory pressure. Under pressure, there is a risk that + * per-cpu-counter-drift will allow the min watermark to be breached + * potentially causing a live-lock. While kswapd is awake and + * free pages are low, get a better estimate for free pages + */ + if (nr_free_pages < zone->percpu_drift_mark && + !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) + return zone_page_state_snapshot(zone, NR_FREE_PAGES); + + return nr_free_pages; +} +#endif /* CONFIG_SMP */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 452e2ba06c7c..b2d21e06d45d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1462,7 +1462,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; + long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) @@ -2424,7 +2424,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone_nr_free_pages(zone)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), diff --git a/mm/vmstat.c b/mm/vmstat.c index a8d6b59e609a..355a9e669aaa 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void) int threshold; for_each_populated_zone(zone) { + unsigned long max_drift, tolerate_drift; + threshold = calculate_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; + + /* + * Only set percpu_drift_mark if there is a danger that + * NR_FREE_PAGES reports the low watermark is ok when in fact + * the min watermark could be breached by an allocation + */ + tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); + max_drift = num_online_cpus() * threshold; + if (max_drift > tolerate_drift) + zone->percpu_drift_mark = high_wmark_pages(zone) + + max_drift; } } @@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_page_state(zone, NR_FREE_PAGES), + zone_nr_free_pages(zone), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), -- cgit v1.2.3 From 9ee493ce0a60bf42c0f8fd0b0fe91df5704a1cbf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Sep 2010 16:38:18 -0700 Subject: mm: page allocator: drain per-cpu lists after direct reclaim allocation fails When under significant memory pressure, a process enters direct reclaim and immediately afterwards tries to allocate a page. If it fails and no further progress is made, it's possible the system will go OOM. However, on systems with large amounts of memory, it's possible that a significant number of pages are on per-cpu lists and inaccessible to the calling process. This leads to a process entering direct reclaim more often than it should increasing the pressure on the system and compounding the problem. This patch notes that if direct reclaim is making progress but allocations are still failing that the system is already under heavy pressure. In this case, it drains the per-cpu lists and tries the allocation a second time before continuing. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Cc: Dave Chinner Cc: Wu Fengguang Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b2d21e06d45d..a8cfa9cc6e86 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1847,6 +1847,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; struct reclaim_state reclaim_state; struct task_struct *p = current; + bool drained = false; cond_resched(); @@ -1865,14 +1866,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); - if (order != 0) - drain_all_pages(); + if (unlikely(!(*did_some_progress))) + return NULL; - if (likely(*did_some_progress)) - page = get_page_from_freelist(gfp_mask, nodemask, order, +retry: + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); + + /* + * If an allocation failed after direct reclaim, it could be because + * pages are pinned on the per-cpu lists. Drain them and try again + */ + if (!page && !drained) { + drain_all_pages(); + drained = true; + goto retry; + } + return page; } -- cgit v1.2.3 From 31c4a3d3a0f84a5847665f8aa0552d188389f791 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 19 Sep 2010 19:40:22 -0700 Subject: mm: further fix swapin race condition Commit 4969c1192d15 ("mm: fix swapin race condition") is now agreed to be incomplete. There's a race, not very much less likely than the original race envisaged, in which it is further necessary to check that the swapcache page's swap has not changed. Here's the reasoning: cast in terms of reuse_swap_page(), but probably could be reformulated to rely on try_to_free_swap() instead, or on swapoff+swapon. A, faults into do_swap_page(): does page1 = lookup_swap_cache(swap1) and comes through the lock_page(page1). B, a racing thread of the same process, faults on the same address: does page1 = lookup_swap_cache(swap1) and now waits in lock_page(page1), but for whatever reason is unlucky not to get the lock any time soon. A carries on through do_swap_page(), a write fault, but cannot reuse the swap page1 (another reference to swap1). Unlocks the page1 (but B doesn't get it yet), does COW in do_wp_page(), page2 now in that pte. C, perhaps the parent of A+B, comes in and write faults the same swap page1 into its mm, reuse_swap_page() succeeds this time, swap1 is freed. kswapd comes in after some time (B still unlucky) and swaps out some pages from A+B and C: it allocates the original swap1 to page2 in A+B, and some other swap2 to the original page1 now in C. But does not immediately free page1 (actually it couldn't: B holds a reference), leaving it in swap cache for now. B at last gets the lock on page1, hooray! Is PageSwapCache(page1)? Yes. Is pte_same(*page_table, orig_pte)? Yes, because page2 has now been given the swap1 which page1 used to have. So B proceeds to insert page1 into A+B's page_table, though its content now belongs to C, quite different from what A wrote there. B ought to have checked that page1's swap was still swap1. Signed-off-by: Hugh Dickins Reviewed-by: Rik van Riel Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 71b161b73bb5..0e18b4d649ec 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2680,10 +2680,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_clear_flag(DELAYACCT_PF_SWAPIN); /* - * Make sure try_to_free_swap didn't release the swapcache - * from under us. The page pin isn't enough to prevent that. + * Make sure try_to_free_swap or reuse_swap_page or swapoff did not + * release the swapcache from under us. The page pin, and pte_same + * test below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not changed. */ - if (unlikely(!PageSwapCache(page))) + if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; if (ksm_might_need_to_copy(page, vma, address)) { -- cgit v1.2.3 From 46b30ea9bc3698bc1d1e6fd726c9601d46fa0a91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Sep 2010 07:57:19 +0200 Subject: percpu: fix pcpu_last_unit_cpu pcpu_first/last_unit_cpu are used to track which cpu has the first and last units assigned. This in turn is used to determine the span of a chunk for man/unmap cache flushes and whether an address belongs to the first chunk or not in per_cpu_ptr_to_phys(). When the number of possible CPUs isn't power of two, a chunk may contain unassigned units towards the end of a chunk. The logic to determine pcpu_last_unit_cpu was incorrect when there was an unused unit at the end of a chunk. It failed to ignore the unused unit and assigned the unused marker NR_CPUS to pcpu_last_unit_cpu. This was discovered through kdump failure which was caused by malfunctioning per_cpu_ptr_to_phys() on a kvm setup with 50 possible CPUs by CAI Qian. Signed-off-by: Tejun Heo Reported-by: CAI Qian Cc: stable@kernel.org --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 58c572b18b07..c76ef3891e0d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1401,9 +1401,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; + pcpu_last_unit_cpu = cpu; } } - pcpu_last_unit_cpu = cpu; pcpu_nr_units = unit; for_each_possible_cpu(cpu) -- cgit v1.2.3 From 976e48f8a5b02fc33f3e5cad87fb3fcea041a49c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Sep 2010 11:48:55 +0200 Subject: bdi: Initialize noop_backing_dev_info properly Properly initialize this backing dev info so that writeback code does not barf when getting to it e.g. via sb->s_bdi. Cc: stable@kernel.org Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- mm/backing-dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c2bf86f470ed..65d420499a61 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info); struct backing_dev_info noop_backing_dev_info = { .name = "noop", + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; EXPORT_SYMBOL_GPL(noop_backing_dev_info); @@ -243,6 +244,7 @@ static int __init default_bdi_init(void) err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); + err = bdi_init(&noop_backing_dev_info); return err; } -- cgit v1.2.3 From f19e8aa11afa24036c6273428da51949b5acf30c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Sep 2010 13:04:52 -0700 Subject: oom: always return a badness score of non-zero for eligible tasks A task's badness score is roughly a proportion of its rss and swap compared to the system's capacity. The scale ranges from 0 to 1000 with the highest score chosen for kill. Thus, this scale operates on a resolution of 0.1% of RAM + swap. Admin tasks are also given a 3% bonus, so the badness score of an admin task using 3% of memory, for example, would still be 0. It's possible that an exceptionally large number of tasks will combine to exhaust all resources but never have a single task that uses more than 0.1% of RAM and swap (or 3.0% for admin tasks). This patch ensures that the badness score of any eligible task is never 0 so the machine doesn't unnecessarily panic because it cannot find a task to kill. Signed-off-by: David Rientjes Cc: Dave Hansen Cc: Nitin Gupta Cc: Pekka Enberg Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fc81cb22869e..859250c7dc06 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, */ points += p->signal->oom_score_adj; - if (points < 0) - return 0; + /* + * Never return 0 for an eligible task that may be killed since it's + * possible that no single user task uses more than 0.1% of memory and + * no single admin tasks uses more than 3.0%. + */ + if (points <= 0) + return 1; return (points < 1000) ? points : 1000; } -- cgit v1.2.3 From d1908362ae0b97374eb8328fbb471576332f9fb1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 22 Sep 2010 13:05:01 -0700 Subject: vmscan: check all_unreclaimable in direct reclaim path M. Vefa Bicakci reported 2.6.35 kernel hang up when hibernation on his 32bit 3GB mem machine. (https://bugzilla.kernel.org/show_bug.cgi?id=16771). Also he bisected the regression to commit bb21c7ce18eff8e6e7877ca1d06c6db719376e3c Author: KOSAKI Motohiro Date: Fri Jun 4 14:15:05 2010 -0700 vmscan: fix do_try_to_free_pages() return value when priority==0 reclaim failure At first impression, this seemed very strange because the above commit only chenged function return value and hibernate_preallocate_memory() ignore return value of shrink_all_memory(). But it's related. Now, page allocation from hibernation code may enter infinite loop if the system has highmem. The reasons are that vmscan don't care enough OOM case when oom_killer_disabled. The problem sequence is following as. 1. hibernation 2. oom_disable 3. alloc_pages 4. do_try_to_free_pages if (scanning_global_lru(sc) && !all_unreclaimable) return 1; If kswapd is not freozen, it would set zone->all_unreclaimable to 1 and then shrink_zones maybe return true(ie, all_unreclaimable is true). So at last, alloc_pages could go to _nopage_. If it is, it should have no problem. This patch adds all_unreclaimable check to protect in direct reclaim path, too. It can care of hibernation OOM case and help bailout all_unreclaimable case slightly. Signed-off-by: KOSAKI Motohiro Signed-off-by: Minchan Kim Reported-by: M. Vefa Bicakci Reported-by: Reviewed-by: Johannes Weiner Tested-by: Acked-by: Rafael J. Wysocki Acked-by: Rik van Riel Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c391c320dbaf..c5dfabf25f11 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static bool shrink_zones(int priority, struct zonelist *zonelist, +static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - bool all_unreclaimable = true; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, } shrink_zone(priority, zone, sc); - all_unreclaimable = false; } +} + +static bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + +/* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. It can't handle OOM during hibernation. + * So let's check zone's unreclaimable in direct reclaim as well as kswapd. + */ +static bool all_unreclaimable(struct zonelist *zonelist, + struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + bool all_unreclaimable = true; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!populated_zone(zone)) + continue; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + if (zone_reclaimable(zone)) { + all_unreclaimable = false; + break; + } + } + return all_unreclaimable; } @@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int priority; - bool all_unreclaimable; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; struct zoneref *z; @@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - all_unreclaimable = shrink_zones(priority, zonelist, sc); + shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -1931,7 +1959,7 @@ out: return sc->nr_reclaimed; /* top priority shrink_zones still had more to do? don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable) + if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; @@ -2197,8 +2225,7 @@ loop_again: total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && - zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) + if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and -- cgit v1.2.3 From e85bfd3aa7a34fa963bb268a676b41694e6dcf96 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Sep 2010 13:05:10 -0700 Subject: oom: filter unkillable tasks from tasklist dump /proc/sys/vm/oom_dump_tasks is enabled by default, so it's necessary to limit as much information as possible that it should emit. The tasklist dump should be filtered to only those tasks that are eligible for oom kill. This is already done for memcg ooms, but this patch extends it to both cpuset and mempolicy ooms as well as init. In addition to suppressing irrelevant information, this also reduces confusion since users currently don't know which tasks in the tasklist aren't eligible for kill (such as those attached to cpusets or bound to mempolicies with a disjoint set of mems or nodes, respectively) since that information is not shown. Signed-off-by: David Rientjes Reviewed-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 859250c7dc06..4029583a1024 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -121,8 +121,8 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) } /* return true if the task is not adequate as candidate victim task. */ -static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem, - const nodemask_t *nodemask) +static bool oom_unkillable_task(struct task_struct *p, + const struct mem_cgroup *mem, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -344,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, /** * dump_tasks - dump current memory state of all system tasks * @mem: current's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * - * Dumps the current memory state of all system tasks, excluding kernel threads. + * Dumps the current memory state of all eligible tasks. Tasks not in the same + * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes + * are not shown. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj * value, oom_score_adj value, and name. * - * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are - * shown. - * * Call with tasklist_lock read-locked. */ -static void dump_tasks(const struct mem_cgroup *mem) +static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); for_each_process(p) { - if (p->flags & PF_KTHREAD) - continue; - if (mem && !task_in_mem_cgroup(p, mem)) + if (oom_unkillable_task(p, mem, nodemask)) continue; task = find_lock_task_mm(p); @@ -386,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *mem) + struct mem_cgroup *mem, const nodemask_t *nodemask) { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " @@ -399,7 +397,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, mem_cgroup_print_oom_info(mem, p); show_mem(); if (sysctl_oom_dump_tasks) - dump_tasks(mem); + dump_tasks(mem, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -441,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int victim_points = 0; if (printk_ratelimit()) - dump_header(p, gfp_mask, order, mem); + dump_header(p, gfp_mask, order, mem, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -487,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order) + int order, const nodemask_t *nodemask) { if (likely(!sysctl_panic_on_oom)) return; @@ -501,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, return; } read_lock(&tasklist_lock); - dump_header(NULL, gfp_mask, order, NULL); + dump_header(NULL, gfp_mask, order, NULL, nodemask); read_unlock(&tasklist_lock); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); @@ -514,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned int points = 0; struct task_struct *p; - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: @@ -646,6 +644,7 @@ static void clear_system_oom(void) void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask) { + const nodemask_t *mpol_mask; struct task_struct *p; unsigned long totalpages; unsigned long freed = 0; @@ -675,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ constraint = constrained_alloc(zonelist, gfp_mask, nodemask, &totalpages); - check_panic_on_oom(constraint, gfp_mask, order); + mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && @@ -693,15 +693,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, } retry: - p = select_bad_process(&points, totalpages, NULL, - constraint == CONSTRAINT_MEMORY_POLICY ? nodemask : - NULL); + p = select_bad_process(&points, totalpages, NULL, mpol_mask); if (PTR_ERR(p) == -1UL) goto out; /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { - dump_header(NULL, gfp_mask, order, NULL); + dump_header(NULL, gfp_mask, order, NULL, mpol_mask); read_unlock(&tasklist_lock); panic("Out of memory and no killable processes...\n"); } -- cgit v1.2.3 From 2aeadc30de45a72648f271603203ab392b80f607 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Sep 2010 13:05:12 -0700 Subject: mmap: call unlink_anon_vmas() in __split_vma() in case of error If __split_vma fails because of an out of memory condition the anon_vma_chain isn't teardown and freed potentially leading to rmap walks accessing freed vma information plus there's a memleak. Signed-off-by: Andrea Arcangeli Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Hugh Dickins Cc: Marcelo Tosatti Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 6128dc8e5ede..00161a48a451 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2009,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, removed_exe_file_vma(mm); fput(new->vm_file); } + unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); out_free_vma: -- cgit v1.2.3 From 433abed6c6f76ca079a9564f2a1a51fd28ebe0ca Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 10 Sep 2010 13:23:03 +0900 Subject: hugetlb, rmap: always use anon_vma root pointer This patch applies Andrea's fix given by the following patch into hugepage rmapping code: commit 288468c334e98aacbb7e2fb8bde6bc1adcd55e05 Author: Andrea Arcangeli Date: Mon Aug 9 17:19:09 2010 -0700 This patch uses anon_vma->root and avoids unnecessary overwriting when anon_vma is already set up. Signed-off-by: Naoya Horiguchi Acked-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/rmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f6f0d2dda2ea..2854857fd63b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1564,13 +1564,14 @@ static void __hugepage_set_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { struct anon_vma *anon_vma = vma->anon_vma; + BUG_ON(!anon_vma); - if (!exclusive) { - struct anon_vma_chain *avc; - avc = list_entry(vma->anon_vma_chain.prev, - struct anon_vma_chain, same_vma); - anon_vma = avc->anon_vma; - } + + if (PageAnon(page)) + return; + if (!exclusive) + anon_vma = anon_vma->root; + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; page->index = linear_page_index(vma, address); -- cgit v1.2.3 From cd67f0d2a9a6b5b9f79f4343dc8805757d9ebae2 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 10 Sep 2010 13:23:04 +0900 Subject: hugetlb, rmap: use hugepage_add_new_anon_rmap() in hugetlb_cow() Obviously, setting anon_vma for COWed hugepage should be done by hugepage_add_new_anon_rmap() to scan vmas faster. This patch fixes it. Signed-off-by: Naoya Horiguchi Acked-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cc5be788a39f..9519f3f6c1da 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2404,7 +2404,7 @@ retry_avoidcopy: set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page); - hugepage_add_anon_rmap(new_page, vma, address); + hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; mmu_notifier_invalidate_range_end(mm, -- cgit v1.2.3 From 56c9cfb13c9b6516017eea4e8cbe22ea02e07ee6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 10 Sep 2010 13:23:04 +0900 Subject: hugetlb, rmap: fix confusing page locking in hugetlb_cow() The "if (!trylock_page)" block in the avoidcopy path of hugetlb_cow() looks confusing and is buggy. Originally this trylock_page() was intended to make sure that old_page is locked even when old_page != pagecache_page, because then only pagecache_page is locked. This patch fixes it by moving page locking into hugetlb_fault(). Signed-off-by: Naoya Horiguchi Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9519f3f6c1da..c03273807182 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2324,11 +2324,8 @@ retry_avoidcopy: * and just make the page writable */ avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { - if (!trylock_page(old_page)) { - if (PageAnon(old_page)) - page_move_anon_rmap(old_page, vma, address); - } else - unlock_page(old_page); + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2631,10 +2628,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } - if (!pagecache_page) { - page = pte_page(entry); + /* + * hugetlb_cow() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. + * Note that locking order is always pagecache_page -> page, + * so no worry about deadlock. + */ + page = pte_page(entry); + if (page != pagecache_page) lock_page(page); - } spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ @@ -2661,9 +2664,8 @@ out_page_table_lock: if (pagecache_page) { unlock_page(pagecache_page); put_page(pagecache_page); - } else { - unlock_page(page); } + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); -- cgit v1.2.3 From a850ea30374ebed32a0724742601861853fde869 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 10 Sep 2010 13:23:06 +0900 Subject: hugetlb, rmap: add BUG_ON(!PageLocked) in hugetlb_add_anon_rmap() Confirming page lock is held in hugetlb_add_anon_rmap() may be useful to detect possible future problems. Signed-off-by: Naoya Horiguchi Acked-by: Rik van Riel Acked-by: Andrea Arcangeli Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 2854857fd63b..9d2ba01bd4f9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1582,6 +1582,8 @@ void hugepage_add_anon_rmap(struct page *page, { struct anon_vma *anon_vma = vma->anon_vma; int first; + + BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); BUG_ON(address < vma->vm_start || address >= vma->vm_end); first = atomic_inc_and_test(&page->_mapcount); -- cgit v1.2.3 From e92b05dec8865619ea2608c5c11a54b01467482f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Sep 2010 14:13:57 -0700 Subject: fremap: get rid of broken 'end' variable Thomas Pollet points out that the 'end' variable is broken. It was computed based on start/size before they were page-aligned, and as such doesn't actually match any of the other actions we take. The overflow test on end was also redundant, since we had already tested it with the properly aligned version. So just get rid of it entirely. The one remaining use for that broken variable can just use 'start+size' like all the other cases already did. Reported-by: Thomas Pollet Signed-off-by: Linus Torvalds --- mm/fremap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 46f5dacf90a2..7b7f852848de 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, { struct mm_struct *mm = current->mm; struct address_space *mapping; - unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; @@ -168,7 +167,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!(vma->vm_flags & VM_CAN_NONLINEAR)) goto out; - if (end <= start || start < vma->vm_start || end > vma->vm_end) + if (start < vma->vm_start || start + size > vma->vm_end) goto out; /* Must set VM_NONLINEAR before any pages are populated. */ -- cgit v1.2.3 From 5ec1055aa5632dd7a8283cdb5fa9be3c535eaa06 Mon Sep 17 00:00:00 2001 From: Larry Woodman Date: Fri, 24 Sep 2010 12:04:48 -0400 Subject: Avoid pgoff overflow in remap_file_pages Thomas Pollet noticed that the remap_file_pages() system call in fremap.c has a potential overflow in the first part of the if statement below, which could cause it to process bogus input parameters. Specifically the pgoff + size parameters could be wrap thereby preventing the system call from failing when it should. Reported-by: Thomas Pollet Signed-off-by: Larry Woodman Signed-off-by: Linus Torvalds --- mm/fremap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 7b7f852848de..ec520c7b28df 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -141,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (start + size <= start) return err; + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return err; + /* Can we represent this offset inside this architecture's pte's? */ #if PTE_FILE_MAX_BITS < BITS_PER_LONG if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) -- cgit v1.2.3 From 4829b906cc063cb7cd1b7f34fa05de6db75ec8bb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 2 Oct 2010 17:46:06 -0700 Subject: ksm: fix page_address_in_vma anon_vma oops 2.6.36-rc1 commit 21d0d443cdc1658a8c1484fdcece4803f0f96d0e "rmap: resurrect page_address_in_vma anon_vma check" was right to resurrect that check; but now that it's comparing anon_vma->roots instead of just anon_vmas, there's a danger of oopsing on a NULL anon_vma. In most cases no NULL anon_vma ever gets here; but it turns out that occasionally KSM, when enabled on a forked or forking process, will itself call page_address_in_vma() on a "half-KSM" page left over from an earlier failed attempt to merge - whose page_anon_vma() is NULL. It's my bug that those should be getting here at all: I thought they were already dealt with, this oops proves me wrong, I'll fix it in the next release - such pages are effectively pinned until their process exits, since rmap cannot find their ptes (though swapoff can). For now just work around it by making page_address_in_vma() safe (and add a comment on why that check is wanted anyway). A similar check in __page_check_anon_rmap() is safe because do_page_add_anon_rmap() already excluded KSM pages. Signed-off-by: Hugh Dickins Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Rik van Riel Signed-off-by: Linus Torvalds --- mm/rmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 9d2ba01bd4f9..92e6757f196e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -381,7 +381,13 @@ vma_address(struct page *page, struct vm_area_struct *vma) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { if (PageAnon(page)) { - if (vma->anon_vma->root != page_anon_vma(page)->root) + struct anon_vma *page__anon_vma = page_anon_vma(page); + /* + * Note: swapoff's unuse_vma() is more efficient with this + * check, and needs it to match anon_vma when KSM is active. + */ + if (!vma->anon_vma || !page__anon_vma || + vma->anon_vma->root != page__anon_vma->root) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { if (!vma->vm_file || -- cgit v1.2.3 From 4e31635c367a9e21a43cfbfae4c9deda2e19d1f4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 2 Oct 2010 17:49:08 -0700 Subject: ksm: fix bad user data when swapping Building under memory pressure, with KSM on 2.6.36-rc5, collapsed with an internal compiler error: typically indicating an error in swapping. Perhaps there's a timing issue which makes it now more likely, perhaps it's just a long time since I tried for so long: this bug goes back to KSM swapping in 2.6.33. Notice how reuse_swap_page() allows an exclusive page to be reused, but only does SetPageDirty if it can delete it from swap cache right then - if it's currently under Writeback, it has to be left in cache and we don't SetPageDirty, but the page can be reused. Fine, the dirty bit will get set in the pte; but notice how zap_pte_range() does not bother to transfer pte_dirty to page_dirty when unmapping a PageAnon. If KSM chooses to share such a page, it will look like a clean copy of swapcache, and not be written out to swap when its memory is needed; then stale data read back from swap when it's needed again. We could fix this in reuse_swap_page() (or even refuse to reuse a page under writeback), but it's more honest to fix my oversight in KSM's write_protect_page(). Several days of testing on three machines confirms that this fixes the issue they showed. Signed-off-by: Hugh Dickins Cc: Andrew Morton Cc: Andrea Arcangeli Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/ksm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index b1873cf03ed9..65ab5c7067d9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -712,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (!ptep) goto out; - if (pte_write(*ptep)) { + if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; swapped = PageSwapCache(page); @@ -735,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at(mm, addr, ptep, entry); goto out_unlock; } - entry = pte_wrprotect(entry); + if (pte_dirty(entry)) + set_page_dirty(page); + entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, addr, ptep, entry); } *orig_pte = *ptep; -- cgit v1.2.3