Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 85
-rw-r--r-- | mm/Makefile | 4
-rw-r--r-- | mm/backing-dev.c | 275
-rw-r--r-- | mm/bootmem.c | 71
-rw-r--r-- | mm/bounce.c | 102
-rw-r--r-- | mm/cleancache.c | 267
-rw-r--r-- | mm/compaction.c | 133
-rw-r--r-- | mm/fadvise.c | 38
-rw-r--r-- | mm/filemap.c | 49
-rw-r--r-- | mm/filemap_xip.c | 3
-rw-r--r-- | mm/fremap.c | 42
-rw-r--r-- | mm/frontswap.c | 156
-rw-r--r-- | mm/huge_memory.c | 227
-rw-r--r-- | mm/hugetlb.c | 344
-rw-r--r-- | mm/internal.h | 12
-rw-r--r-- | mm/kmemleak.c | 14
-rw-r--r-- | mm/ksm.c | 670
-rw-r--r-- | mm/madvise.c | 116
-rw-r--r-- | mm/memblock.c | 34
-rw-r--r-- | mm/memcontrol.c | 1138
-rw-r--r-- | mm/memory-failure.c | 224
-rw-r--r-- | mm/memory.c | 235
-rw-r--r-- | mm/memory_hotplug.c | 622
-rw-r--r-- | mm/mempolicy.c | 189
-rw-r--r-- | mm/migrate.c | 203
-rw-r--r-- | mm/mincore.c | 5
-rw-r--r-- | mm/mlock.c | 134
-rw-r--r-- | mm/mm_init.c | 78
-rw-r--r-- | mm/mmap.c | 370
-rw-r--r-- | mm/mmu_context.c | 3
-rw-r--r-- | mm/mmu_notifier.c | 69
-rw-r--r-- | mm/mmzone.c | 20
-rw-r--r-- | mm/mremap.c | 44
-rw-r--r-- | mm/nobootmem.c | 54
-rw-r--r-- | mm/nommu.c | 134
-rw-r--r-- | mm/oom_kill.c | 6
-rw-r--r-- | mm/page-writeback.c | 28
-rw-r--r-- | mm/page_alloc.c | 674
-rw-r--r-- | mm/page_io.c | 88
-rw-r--r-- | mm/page_isolation.c | 26
-rw-r--r-- | mm/pagewalk.c | 70
-rw-r--r-- | mm/pgtable-generic.c | 5
-rw-r--r-- | mm/process_vm_access.c | 8
-rw-r--r-- | mm/readahead.c | 11
-rw-r--r-- | mm/rmap.c | 42
-rw-r--r-- | mm/shmem.c | 158
-rw-r--r-- | mm/slab.c | 10
-rw-r--r-- | mm/slab_common.c | 24
-rw-r--r-- | mm/slob.c | 2
-rw-r--r-- | mm/slub.c | 13
-rw-r--r-- | mm/sparse-vmemmap.c | 27
-rw-r--r-- | mm/sparse.c | 94
-rw-r--r-- | mm/swap.c | 127
-rw-r--r-- | mm/swap_state.c | 82
-rw-r--r-- | mm/swapfile.c | 252
-rw-r--r-- | mm/truncate.c | 117
-rw-r--r-- | mm/util.c | 27
-rw-r--r-- | mm/vmalloc.c | 428
-rw-r--r-- | mm/vmpressure.c | 374
-rw-r--r-- | mm/vmscan.c | 1051
-rw-r--r-- | mm/vmstat.c | 13
-rw-r--r-- | mm/zbud.c | 527
-rw-r--r-- | mm/zswap.c | 943
63 files changed, 7994 insertions(+), 3397 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index 278e3ab1f169..8028dcc6615c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1,6 +1,6 @@ config SELECT_MEMORY_MODEL def_bool y - depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL + depends on ARCH_SELECT_MEMORY_MODEL choice prompt "Memory model" @@ -162,12 +162,18 @@ config MOVABLE_NODE Say Y here if you want to hotplug a whole node. Say N here if you want kernel to use memory on all nodes evenly. +# +# Only be set on architectures that have completely implemented memory hotplug +# feature. If you are not sure, don't touch it. +# +config HAVE_BOOTMEM_INFO_NODE + def_bool n + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" - select MEMORY_ISOLATION depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG + depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE @@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTREMOVE bool "Allow for memory hot remove" + select MEMORY_ISOLATION + select HAVE_BOOTMEM_INFO_NODE if X86_64 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION @@ -255,8 +263,27 @@ config ZONE_DMA_FLAG default "1" config BOUNCE - def_bool y + bool "Enable bounce buffers" + default y depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + help + Enable bounce buffers for devices that cannot access + the full range of memory available to the CPU. Enabled + by default when ZONE_DMA or HIGHMEM is selected, but you + may say n to override this. + +# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often +# have more than 4GB of memory, but we don't currently use the IOTLB to present +# a 32-bit address to OHCI. So we need to use a bounce pool instead. +# +# We also use the bounce pool to provide stable page writes for jbd. jbd +# initiates buffer writeback without locking the page or setting PG_writeback, +# and fixing that behavior (a second time; jbd2 doesn't have this problem) is +# a major rework effort. Instead, use the bounce buffer to snapshot pages +# (until jbd goes away). The only jbd user is ext3. +config NEED_BOUNCE_POOL + bool + default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) config NR_QUICK int @@ -265,8 +292,12 @@ config NR_QUICK default "1" config VIRT_TO_BUS - def_bool y - depends on !ARCH_NO_VIRT_TO_BUS + bool + help + An architecture should select this if it implements the + deprecated interface virt_to_bus(). All new architectures + should probably not select this. + config MMU_NOTIFIER bool @@ -446,3 +477,45 @@ config FRONTSWAP and swap data is stored as normal on the matching swap device. If unsure, say Y to enable frontswap. + +config ZBUD + tristate + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. + +config ZSWAP + bool "Compressed cache for swap pages (EXPERIMENTAL)" + depends on FRONTSWAP && CRYPTO=y + select CRYPTO_LZO + select ZBUD + default n + help + A lightweight compressed cache for swap pages. It takes + pages that are in the process of being swapped out and attempts to + compress them into a dynamically allocated RAM-based memory pool. 
+ This can result in a significant I/O reduction on swap device and, + in the case where decompressing from RAM is faster that swap device + reads, can also improve workload performance. + + This is marked experimental because it is a new feature (as of + v3.11) that interacts heavily with memory reclaim. While these + interactions don't cause any known issues on simple memory setups, + they have not be fully explored on the large set of potential + configurations and workloads that exist. + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. diff --git a/mm/Makefile b/mm/Makefile index 3a4628751f89..f00803386a67 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o +obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o @@ -50,7 +51,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o @@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZBUD) += zbud.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d3ca2b3ee176..d014ee5fcbbd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as - * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side * locking. */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); -LIST_HEAD(bdi_pending_list); + +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { @@ -221,12 +222,23 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio) +static ssize_t stable_pages_required_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + bdi_cap_stable_pages_required(bdi) ? 
1 : 0); +} + #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) static struct device_attribute bdi_dev_attrs[] = { __ATTR_RW(read_ahead_kb), __ATTR_RW(min_ratio), __ATTR_RW(max_ratio), + __ATTR_RO(stable_pages_required), __ATTR_NULL, }; @@ -246,6 +258,11 @@ static int __init default_bdi_init(void) { int err; + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!bdi_wq) + return -ENOMEM; + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -260,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - /* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the @@ -296,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. - */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. 
- */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. - */ - spin_lock_bh(&bdi->wb_lock); - bdi->wb.task = task; - spin_unlock_bh(&bdi->wb_lock); - wake_up_process(task); - } - bdi_clear_pending(bdi); - break; - - case KILL_THREAD: - __set_current_state(TASK_RUNNING); - kthread_stop(task); - bdi_clear_pending(bdi); - break; - - case NO_ACTION: - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) - /* - * There are no dirty data. The only thing we - * should now care about is checking for - * inactive bdi threads and killing them. Thus, - * let's sleep for longer time, save energy and - * be friendly for battery-driven devices. - */ - schedule_timeout(bdi_longest_inactive()); - else - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - try_to_freeze(); - break; - } - } - - return 0; + mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); } /* @@ -478,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); + + /* bdi_list is now unused, clear it to mark @bdi dying */ + INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -497,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; - /* - * Just start the forker thread for our default backing_dev_info, - * and add other bdi's to the list. They will get a thread created - * on-demand when they need it. 
- */ - if (bdi_cap_flush_forker(bdi)) { - struct bdi_writeback *wb = &bdi->wb; - - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", - dev_name(dev)); - if (IS_ERR(wb->task)) - return PTR_ERR(wb->task); - } - bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); @@ -534,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct task_struct *task; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -545,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) bdi_remove_from_list(bdi); /* - * If setup is pending, wait for that to complete first + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); + WARN_ON(!list_empty(&bdi->work_list)); /* - * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * This shouldn't be necessary unless @bdi for some reason has + * unflushed dirty IO after work_list is drained. Do it anyway + * just in case. */ - spin_lock_bh(&bdi->wb_lock); - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock_bh(&bdi->wb_lock); - - if (task) - kthread_stop(task); + cancel_delayed_work_sync(&bdi->wb.dwork); } /* @@ -586,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); - del_timer_sync(&bdi->wb.wakeup_timer); - if (!bdi_cap_flush_forker(bdi)) - bdi_wb_shutdown(bdi); + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); spin_lock_bh(&bdi->wb_lock); @@ -611,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } /* @@ -684,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); /* - * If bdi_unregister() had already been called earlier, the - * wakeup_timer could still be armed because bdi_prune_sb() - * can race with the bdi_wakeup_thread_delayed() calls from - * __mark_inode_dirty(). + * If bdi_unregister() had already been called earlier, the dwork + * could still be pending because bdi_prune_sb() can race with the + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). 
*/ - del_timer_sync(&bdi->wb.wakeup_timer); + cancel_delayed_work_sync(&bdi->wb.dwork); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); @@ -705,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy); int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, unsigned int cap) { - char tmp[32]; int err; bdi->name = name; @@ -714,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, if (err) return err; - sprintf(tmp, "%.28s%s", name, "-%d"); - err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); + err = bdi_register(bdi, NULL, "%.28s-%ld", name, + atomic_long_inc_return(&bdi_seq)); if (err) { bdi_destroy(bdi); return err; diff --git a/mm/bootmem.c b/mm/bootmem.c index 1324cd74faec..6ab7744e692e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) while (start < end) { unsigned long *map, idx, vec; + unsigned shift; map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; + shift = idx & (BITS_PER_LONG - 1); + /* + * vec holds at most BITS_PER_LONG map bits, + * bit 0 corresponds to start. + */ vec = ~map[idx / BITS_PER_LONG]; + + if (shift) { + vec >>= shift; + if (end - start >= BITS_PER_LONG) + vec |= ~map[idx / BITS_PER_LONG + 1] << + (BITS_PER_LONG - shift); + } /* * If we have a properly aligned and fully unreserved * BITS_PER_LONG block of pages in front of us, free @@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long off = 0; + unsigned long cur = start; - vec >>= start & (BITS_PER_LONG - 1); - while (vec) { + start = ALIGN(start + 1, BITS_PER_LONG); + while (vec && cur != start) { if (vec & 1) { - page = pfn_to_page(start + off); + page = pfn_to_page(cur); __free_pages_bootmem(page, 0); count++; } vec >>= 1; - off++; + ++cur; } - start = ALIGN(start + 1, BITS_PER_LONG); } } @@ -229,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. 
- */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +void __init reset_all_zones_managed_pages(void) { - register_page_bootmem_info_node(pgdat); - reset_node_lowmem_managed_pages(pgdat); - return free_all_bootmem_core(pgdat->bdata); + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -267,14 +272,14 @@ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; bootmem_data_t *bdata; - struct pglist_data *pgdat; - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); + totalram_pages += total_pages; + return total_pages; } @@ -821,6 +826,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from diff --git a/mm/bounce.c b/mm/bounce.c index 042086775561..c9f0a4339a7d 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) struct bio_vec *tovec, *fromvec; int i; - __bio_for_each_segment(tovec, to, i, 0) { + bio_for_each_segment(tovec, to, i) { fromvec = from->bi_io_vec + i; /* @@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) /* * free up bounce indirect pages used */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { org_vec = bio_orig->bi_io_vec + i; if (bvec->bv_page == org_vec->bv_page) continue; @@ -178,81 +178,64 @@ static void bounce_end_io_read_isa(struct bio *bio, int err) __bounce_end_io_read(bio, isa_page_pool, err); } +#ifdef CONFIG_NEED_BOUNCE_POOL +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ + if (bio_data_dir(bio) != WRITE) + return 0; + + if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) + return 0; + + return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); +} +#else +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ + return 0; +} +#endif /* CONFIG_NEED_BOUNCE_POOL */ + static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool) + mempool_t *pool, int force) { - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); + struct bio *bio; + int rw = bio_data_dir(*bio_orig); struct bio_vec *to, *from; + unsigned i; - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; - - /* - * is destination page below bounce pfn? 
- */ - if (page_to_pfn(page) <= queue_bounce_pfn(q)) - continue; + bio_for_each_segment(from, *bio_orig, i) + if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) + goto bounce; - /* - * irk, bounce it - */ - if (!bio) { - unsigned int cnt = (*bio_orig)->bi_vcnt; + return; +bounce: + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - bio = bio_alloc(GFP_NOIO, cnt); - memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); - } - + bio_for_each_segment_all(to, bio, i) { + struct page *page = to->bv_page; - to = bio->bi_io_vec + i; + if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + to->bv_page = mempool_alloc(pool, q->bounce_gfp); if (rw == WRITE) { char *vto, *vfrom; - flush_dcache_page(from->bv_page); + flush_dcache_page(page); + vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); + kunmap_atomic(vfrom); } } - /* - * no pages bounced - */ - if (!bio) - return; - trace_block_bio_bounce(q, *bio_orig); - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; if (pool == page_pool) { bio->bi_end_io = bounce_end_io_write; @@ -270,6 +253,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { + int must_bounce; mempool_t *pool; /* @@ -278,13 +262,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!bio_has_data(*bio_orig)) return; + must_bounce = must_snapshot_stable_pages(q, *bio_orig); + /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) return; pool = page_pool; } else { @@ -295,7 +281,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * slow path */ - __blk_queue_bounce(q, bio_orig, pool); + __blk_queue_bounce(q, bio_orig, pool, must_bounce); } EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/cleancache.c b/mm/cleancache.c index 32e6f4136fa2..5875f48ce279 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,20 +19,10 @@ #include <linux/cleancache.h> /* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/invalidate even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. 
- */ -int cleancache_enabled __read_mostly; -EXPORT_SYMBOL(cleancache_enabled); - -/* * cleancache_ops is set by cleancache_ops_register to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops cleancache_ops __read_mostly; +static struct cleancache_ops *cleancache_ops __read_mostly; /* * Counters available via /sys/kernel/debug/frontswap (if debugfs is @@ -45,15 +35,101 @@ static u64 cleancache_puts; static u64 cleancache_invalidates; /* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting + * When no backend is registered all calls to init_fs and init_shared_fs + * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or + * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array + * [shared_|]fs_poolid_map) are given to the respective super block + * (sb->cleancache_poolid) and no tmem_pools are created. When a backend + * registers with cleancache the previous calls to init_fs and init_shared_fs + * are executed to create tmem_pools and set the respective poolids. While no + * backend is registered all "puts", "gets" and "flushes" are ignored or failed. + */ +#define MAX_INITIALIZABLE_FS 32 +#define FAKE_FS_POOLID_OFFSET 1000 +#define FAKE_SHARED_FS_POOLID_OFFSET 2000 + +#define FS_NO_BACKEND (-1) +#define FS_UNKNOWN (-2) +static int fs_poolid_map[MAX_INITIALIZABLE_FS]; +static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; +static char *uuids[MAX_INITIALIZABLE_FS]; +/* + * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads + * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple + * threads calling mount (and ending up in __cleancache_init_[shared|]fs). + */ +static DEFINE_MUTEX(poolid_mutex); +/* + * When set to false (default) all calls to the cleancache functions, except + * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded + * by the if (!cleancache_ops) return. This means multiple threads (from + * different filesystems) will be checking cleancache_ops. The usage of a + * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are + * OK if the time between the backend's have been initialized (and + * cleancache_ops has been set to not NULL) and when the filesystems start + * actually calling the backends. The inverse (when unloading) is obviously + * not good - but this shim does not do that (yet). + */ + +/* + * The backends and filesystems work all asynchronously. This is b/c the + * backends can be built as modules. + * The usual sequence of events is: + * a) mount / -> __cleancache_init_fs is called. We set the + * [shared_|]fs_poolid_map and uuids for. + * + * b). user does I/Os -> we call the rest of __cleancache_* functions + * which return immediately as cleancache_ops is false. + * + * c). modprobe zcache -> cleancache_register_ops. We init the backend + * and set cleancache_ops to true, and for any fs_poolid_map + * (which is set by __cleancache_init_fs) we initialize the poolid. + * + * d). user does I/Os -> now that cleancache_ops is true all the + * __cleancache_* functions can call the backend. They all check + * that fs_poolid_map is valid and if so invoke the backend. + * + * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is + * reset (which is the second check in the __cleancache_* ops + * to call the backend). + * + * The sequence of event could also be c), followed by a), and d). and e). 
The + * c) would not happen anymore. There is also the chance of c), and one thread + * doing a) + d), and another doing e). For that case we depend on the + * filesystem calling __cleancache_invalidate_fs in the proper sequence (so + * that it handles all I/Os before it invalidates the fs (which is last part + * of unmounting process). + * + * Note: The acute reader will notice that there is no "rmmod zcache" case. + * This is b/c the functionality for that is not yet implemented and when + * done, will require some extra locking not yet devised. + */ + +/* + * Register operations for cleancache, returning previous thus allowing + * detection of multiple backends and possible nesting. */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) { - struct cleancache_ops old = cleancache_ops; + struct cleancache_ops *old = cleancache_ops; + int i; - cleancache_ops = *ops; - cleancache_enabled = 1; + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (fs_poolid_map[i] == FS_NO_BACKEND) + fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); + if (shared_fs_poolid_map[i] == FS_NO_BACKEND) + shared_fs_poolid_map[i] = ops->init_shared_fs + (uuids[i], PAGE_SIZE); + } + /* + * We MUST set cleancache_ops _after_ we have called the backends + * init_fs or init_shared_fs functions. Otherwise the compiler might + * re-order where cleancache_ops is set in this function. + */ + barrier(); + cleancache_ops = ops; + mutex_unlock(&poolid_mutex); return old; } EXPORT_SYMBOL(cleancache_register_ops); @@ -61,15 +137,42 @@ EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); + int i; + + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (fs_poolid_map[i] == FS_UNKNOWN) { + sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; + if (cleancache_ops) + fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); + else + fs_poolid_map[i] = FS_NO_BACKEND; + break; + } + } + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) { - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); + int i; + + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (shared_fs_poolid_map[i] == FS_UNKNOWN) { + sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; + uuids[i] = uuid; + if (cleancache_ops) + shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs + (uuid, PAGE_SIZE); + else + shared_fs_poolid_map[i] = FS_NO_BACKEND; + break; + } + } + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -89,7 +192,7 @@ static int cleancache_get_key(struct inode *inode, fhfn = sb->s_export_op->encode_fh; if (fhfn) { len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); - if (len <= 0 || len == 255) + if (len <= FILEID_ROOT || len == FILEID_INVALID) return -1; if (maxlen > CLEANCACHE_KEY_MAX) return -1; @@ -99,27 +202,53 @@ static int cleancache_get_key(struct inode *inode, } /* + * Returns a pool_id that is associated with a given fake poolid. 
+ */ +static int get_poolid_from_fake(int fake_pool_id) +{ + if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) + return shared_fs_poolid_map[fake_pool_id - + FAKE_SHARED_FS_POOLID_OFFSET]; + else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) + return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; + return FS_NO_BACKEND; +} + +/* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if * successful, use it to fill the specified page with data and return 0. * The pageframe is unchanged and returns -1 if the get fails. * Page must be locked by caller. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; + int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_failed_gets++; + goto out; + } + VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) + fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (fake_pool_id < 0) goto out; + pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + if (pool_id >= 0) + ret = cleancache_ops->get_page(pool_id, + key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -134,17 +263,32 @@ EXPORT_SYMBOL(__cleancache_get_page); * (previously-obtained per-filesystem) poolid and the page's, * inode and page index. Page must be locked. Note that a put_page * always "succeeds", though a subsequent get_page may succeed or fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_put_page(struct page *page) { int pool_id; + int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_puts++; + return; + } + VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; + fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (fake_pool_id < 0) + return; + + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_get_key(page->mapping->host, &key) >= 0) { + cleancache_ops->put_page(pool_id, key, page->index, page); cleancache_puts++; } } @@ -153,19 +297,31 @@ EXPORT_SYMBOL(__cleancache_put_page); /* * Invalidate any data from cleancache associated with the poolid and the * page's inode and page index so that a subsequent "get" will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... 
page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id; + int fake_pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; - if (pool_id >= 0) { + if (!cleancache_ops) + return; + + if (fake_pool_id >= 0) { + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id < 0) + return; + VM_BUG_ON(!PageLocked(page)); if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.invalidate_page)(pool_id, - key, page->index); + cleancache_ops->invalidate_page(pool_id, + key, page->index); cleancache_invalidates++; } } @@ -176,34 +332,63 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); * Invalidate all data from cleancache associated with the poolid and the * mappings's inode so that all subsequent gets to this poolid/inode * will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id; + int fake_pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) + return; + + if (fake_pool_id < 0) + return; + + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.invalidate_inode)(pool_id, key); + cleancache_ops->invalidate_inode(pool_id, key); } EXPORT_SYMBOL(__cleancache_invalidate_inode); /* * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs. 
*/ void __cleancache_invalidate_fs(struct super_block *sb) { - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.invalidate_fs)(old_poolid); + int index; + int fake_pool_id = sb->cleancache_poolid; + int old_poolid = fake_pool_id; + + mutex_lock(&poolid_mutex); + if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { + index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; + old_poolid = shared_fs_poolid_map[index]; + shared_fs_poolid_map[index] = FS_UNKNOWN; + uuids[index] = NULL; + } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { + index = fake_pool_id - FAKE_FS_POOLID_OFFSET; + old_poolid = fs_poolid_map[index]; + fs_poolid_map[index] = FS_UNKNOWN; } + sb->cleancache_poolid = -1; + if (cleancache_ops) + cleancache_ops->invalidate_fs(old_poolid); + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { + int i; + #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -215,6 +400,10 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + fs_poolid_map[i] = FS_UNKNOWN; + shared_fs_poolid_map[i] = FS_UNKNOWN; + } return 0; } module_init(init_cleancache) diff --git a/mm/compaction.c b/mm/compaction.c index 6b807e466497..05ccb4cc0bdb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -15,6 +15,7 @@ #include <linux/sysctl.h> #include <linux/sysfs.h> #include <linux/balloon_compaction.h> +#include <linux/page-isolation.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void __reset_isolation_suitable(struct zone *zone) { unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; zone->compact_cached_migrate_pfn = start_pfn; @@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page) int migratetype = get_pageblock_migratetype(page); /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + if (migratetype == MIGRATE_RESERVE) + return false; + + if (is_migrate_isolate(migratetype)) return false; /* If the page is a large free page, then allow migration */ @@ -611,8 +615,7 @@ check_compact_cluster: continue; next_pageblock: - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; last_pageblock_nr = pageblock_nr; } @@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; + unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone, */ high_pfn = min(low_pfn, pfn); - zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + z_end_pfn = zone_end_pfn(zone); /* * Isolate free pages until enough are available to migrate the @@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone, * only scans within a pageblock */ end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - end_pfn = min(end_pfn, zone_end_pfn); + end_pfn = 
min(end_pfn, z_end_pfn); isolated = isolate_freepages_block(cc, pfn, end_pfn, freelist, false); nr_freepages += isolated; @@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); + end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* Do not cross the free scanner or scan within a memory hole */ if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { @@ -816,6 +819,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -850,22 +854,16 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - if (cc->page) { - /* Was a suitable page captured? */ - if (*cc->page) + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; - } else { - unsigned int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; - /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) - return COMPACT_PARTIAL; - } } return COMPACT_CONTINUE; @@ -921,65 +919,11 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. 
- */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long end_pfn = zone_end_pfn(zone); ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -1036,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, false, + (unsigned long)cc, cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, MR_COMPACTION); update_nr_listpages(cc); @@ -1054,9 +998,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } - - /* Capture a page now if it is a suitable size */ - compact_capture_page(cc); } out: @@ -1069,8 +1010,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended, - struct page **page) + bool sync, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1080,7 +1020,6 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1110,7 +1049,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1136,7 +1075,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended, page); + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -1150,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact all zones within a node */ -static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) +static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) { int zoneid; struct zone *zone; @@ -1183,34 +1122,30 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); } - - return 0; } -int compact_pgdat(pg_data_t *pgdat, int order) +void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, .sync = false, - .page 
= NULL, }; - return __compact_pgdat(pgdat, &cc); + __compact_pgdat(pgdat, &cc); } -static int compact_node(int nid) +static void compact_node(int nid) { struct compact_control cc = { .order = -1, .sync = true, - .page = NULL, }; - return __compact_pgdat(NODE_DATA(nid), &cc); + __compact_pgdat(NODE_DATA(nid), &cc); } /* Compact all nodes in the system */ -static int compact_nodes(void) +static void compact_nodes(void) { int nid; @@ -1219,8 +1154,6 @@ static int compact_nodes(void) for_each_online_node(nid) compact_node(nid); - - return COMPACT_COMPLETE; } /* The written value is actually unused, all memory is compacted */ @@ -1231,7 +1164,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { if (write) - return compact_nodes(); + compact_nodes(); return 0; } diff --git a/mm/fadvise.c b/mm/fadvise.c index a47f0f50c89f..3bcfd81db45e 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -17,6 +17,7 @@ #include <linux/fadvise.h> #include <linux/writeback.h> #include <linux/syscalls.h> +#include <linux/swap.h> #include <asm/unistd.h> @@ -24,7 +25,7 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. */ -SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) +SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { struct fd f = fdget(fd); struct address_space *mapping; @@ -38,7 +39,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) if (!f.file) return -EBADF; - if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { + if (S_ISFIFO(file_inode(f.file)->i_mode)) { ret = -ESPIPE; goto out; } @@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); - if (end_index >= start_index) - invalidate_mapping_pages(mapping, start_index, + if (end_index >= start_index) { + unsigned long count = invalidate_mapping_pages(mapping, + start_index, end_index); + + /* + * If fewer pages were invalidated than expected then + * it is possible that some of the pages were on + * a per-cpu pagevec for a remote CPU. Drain all + * pagevecs and try again. 
+ */ + if (count < (end_index - start_index + 1)) { + lru_add_drain_all(); + invalidate_mapping_pages(mapping, start_index, end_index); + } + } break; default: ret = -EINVAL; @@ -131,26 +145,12 @@ out: fdput(f); return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) -{ - return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); -} -SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); -#endif #ifdef __ARCH_WANT_SYS_FADVISE64 -SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) +SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) { return sys_fadvise64_64(fd, offset, len, advice); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) -{ - return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); -} -SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); -#endif #endif diff --git a/mm/filemap.c b/mm/filemap.c index 83efee76a5c0..4b51ac1acae7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,9 @@ #include <linux/cleancache.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/filemap.h> + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ @@ -113,6 +116,7 @@ void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -184,6 +188,17 @@ static int sleep_on_page_killable(void *word) return fatal_signal_pending(current) ? -EINTR : 0; } +static int filemap_check_errors(struct address_space *mapping) +{ + int ret = 0; + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + return ret; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -265,10 +280,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret = 0; + int ret2, ret = 0; if (end_byte < start_byte) - return 0; + goto out; pagevec_init(&pvec, 0); while ((index <= end) && @@ -291,12 +306,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, pagevec_release(&pvec); cond_resched(); } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; +out: + ret2 = filemap_check_errors(mapping); + if (!ret) + ret = ret2; return ret; } @@ -337,6 +350,8 @@ int filemap_write_and_wait(struct address_space *mapping) if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } @@ -368,6 +383,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } @@ -464,6 +481,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); } else { page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ @@ -1521,12 +1539,12 @@ static void 
do_sync_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (!ra->ra_pages) return; - if (VM_SequentialReadHint(vma)) { + if (vma->vm_flags & VM_SEQ_READ) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; @@ -1566,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (ra->mmap_miss > 0) ra->mmap_miss--; @@ -1711,7 +1729,7 @@ EXPORT_SYMBOL(filemap_fault); int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); @@ -1728,6 +1746,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * see the dirty page and writeprotect it again. */ set_page_dirty(page); + wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); return ret; @@ -2056,7 +2075,7 @@ EXPORT_SYMBOL(iov_iter_fault_in_readable); /* * Return the count of just the current iov_iter segment. */ -size_t iov_iter_single_seg_count(struct iov_iter *i) +size_t iov_iter_single_seg_count(const struct iov_iter *i) { const struct iovec *iov = i->iov; if (i->nr_segs == 1) @@ -2274,7 +2293,7 @@ repeat: return NULL; } found: - wait_on_page_writeback(page); + wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); @@ -2527,7 +2546,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2539,7 +2557,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - sb_end_write(inode->i_sb); return ret; } EXPORT_SYMBOL(generic_file_aio_write); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a912da6ddfd4..28fe26b64f8a 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -404,8 +404,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t pos; ssize_t ret; - sb_start_write(inode->i_sb); - mutex_lock(&inode->i_mutex); if (!access_ok(VERIFY_READ, buf, len)) { @@ -439,7 +437,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, current->backing_dev_info = NULL; out_up: mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); return ret; } EXPORT_SYMBOL_GPL(xip_file_write); diff --git a/mm/fremap.c b/mm/fremap.c index a0aaf0e56800..87da3590c61e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; + vm_flags_t vm_flags = 0; if (prot) return err; @@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within - * the single existing vma. vm_private_data is used as a - * swapout cursor in a VM_NONLINEAR vma. + * the single existing vma. 
*/ if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) - goto out; - if (!vma->vm_ops || !vma->vm_ops->remap_pages) goto out; @@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* Must set VM_NONLINEAR before any pages are populated. */ if (!(vma->vm_flags & VM_NONLINEAR)) { + /* + * vm_private_data is used as a swapout cursor + * in a VM_NONLINEAR vma. + */ + if (vma->vm_private_data) + goto out; + /* Don't need a nonlinear mapping, exit success */ if (pgoff == linear_page_index(vma, start)) { err = 0; @@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } if (!has_write_lock) { +get_write_lock: up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); has_write_lock = 1; @@ -199,9 +204,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long addr; struct file *file = get_file(vma->vm_file); - flags &= MAP_NONBLOCK; addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff); + vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -224,28 +228,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * drop PG_Mlocked flag for over-mapped range */ - vm_flags_t saved_flags = vma->vm_flags; + if (!has_write_lock) + goto get_write_lock; + vm_flags = vma->vm_flags; munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = saved_flags; + vma->vm_flags = vm_flags; } mmu_notifier_invalidate_range_start(mm, start, start + size); err = vma->vm_ops->remap_pages(vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); - if (!err && !(flags & MAP_NONBLOCK)) { - if (vma->vm_flags & VM_LOCKED) { - /* - * might be mapping previously unmapped range of file - */ - mlock_vma_pages_range(vma, start, start + size); - } else { - if (unlikely(has_write_lock)) { - downgrade_write(&mm->mmap_sem); - has_write_lock = 0; - } - make_pages_present(start, start+size); - } - } /* * We can't clear VM_NONLINEAR because we'd have to do @@ -254,10 +246,14 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, */ out: + if (vma) + vm_flags = vma->vm_flags; if (likely(!has_write_lock)) up_read(&mm->mmap_sem); else up_write(&mm->mmap_sem); + if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) + mm_populate(start, size); return err; } diff --git a/mm/frontswap.c b/mm/frontswap.c index 2890e67d6026..1b24bdcb3197 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -24,15 +24,7 @@ * frontswap_ops is set by frontswap_register_ops to contain the pointers * to the frontswap "backend" implementation functions. */ -static struct frontswap_ops frontswap_ops __read_mostly; - -/* - * This global enablement flag reduces overhead on systems where frontswap_ops - * has not been registered, so is preferred to the slower alternative: a - * function call that checks a non-global. - */ -bool frontswap_enabled __read_mostly; -EXPORT_SYMBOL(frontswap_enabled); +static struct frontswap_ops *frontswap_ops __read_mostly; /* * If enabled, frontswap_store will return failure even on success. 
As @@ -80,16 +72,70 @@ static inline void inc_frontswap_succ_stores(void) { } static inline void inc_frontswap_failed_stores(void) { } static inline void inc_frontswap_invalidates(void) { } #endif + +/* + * Due to the asynchronous nature of the backends loading potentially + * _after_ the swap system has been activated, we have chokepoints + * on all frontswap functions to not call the backend until the backend + * has registered. + * + * Specifically when no backend is registered (nobody called + * frontswap_register_ops) all calls to frontswap_init (which is done via + * swapon -> enable_swap_info -> frontswap_init) are registered and remembered + * (via the setting of need_init bitmap) but fail to create tmem_pools. When a + * backend registers with frontswap at some later point the previous + * calls to frontswap_init are executed (by iterating over the need_init + * bitmap) to create tmem_pools and set the respective poolids. All of that is + * guarded by us using atomic bit operations on the 'need_init' bitmap. + * + * This would not guard us against the user deciding to call swapoff right as + * we are calling the backend to initialize (so swapon is in action). + * Fortunately for us, the swapon_mutex has been taken by the callee so we are + * OK. The other scenario, where calls to frontswap_store (called via + * swap_writepage) race with frontswap_invalidate_area (called via + * swapoff), is again guarded by the swap subsystem. + * + * While no backend is registered all calls to frontswap_[store|load| + * invalidate_area|invalidate_page] are ignored or fail. + * + * The time between the backend being registered and the swap file system + * calling the backend (via the frontswap_* functions) is indeterminate as + * frontswap_ops is not atomic_t (or a value guarded by a spinlock). + * That is OK as we are comfortable missing some of these calls to the newly + * registered backend. + * + * Obviously the opposite (unloading the backend) must be done after all + * the frontswap_[store|load|invalidate_area|invalidate_page] start + * ignoring or failing the requests - at which point frontswap_ops + * would have to be made atomic in some fashion. + */ +static DECLARE_BITMAP(need_init, MAX_SWAPFILES); + /* * Register operations for frontswap, returning previous thus allowing * detection of multiple backends and possible nesting. */ -struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) +struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) { - struct frontswap_ops old = frontswap_ops; - - frontswap_ops = *ops; - frontswap_enabled = true; + struct frontswap_ops *old = frontswap_ops; + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (test_and_clear_bit(i, need_init)) { + struct swap_info_struct *sis = swap_info[i]; + /* __frontswap_init _should_ have set it! */ + if (!sis->frontswap_map) + return ERR_PTR(-EINVAL); + ops->init(i); + } + } + /* + * We MUST have frontswap_ops set _after_ the frontswap_init's + * have been called. Otherwise __frontswap_store might fail. Hence + * the barrier to make sure the compiler does not re-order us. + */ + barrier(); + frontswap_ops = ops; return old; } EXPORT_SYMBOL(frontswap_register_ops); @@ -115,20 +161,48 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); /* * Called when a swap device is swapon'd.
*/ -void __frontswap_init(unsigned type) +void __frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; BUG_ON(sis == NULL); - if (sis->frontswap_map == NULL) + + /* + * p->frontswap is a bitmap that we MUST have to figure out which page + * has gone in frontswap. Without it there is no point in continuing. + */ + if (WARN_ON(!map)) return; - frontswap_ops.init(type); + /* + * Regardless of whether the frontswap backend has been loaded + * before this function or will be loaded later, we _MUST_ have the + * p->frontswap set to something valid to work properly. + */ + frontswap_map_set(sis, map); + if (frontswap_ops) + frontswap_ops->init(type); + else { + BUG_ON(type > MAX_SWAPFILES); + set_bit(type, need_init); + } } EXPORT_SYMBOL(__frontswap_init); -static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +bool __frontswap_test(struct swap_info_struct *sis, + pgoff_t offset) +{ + bool ret = false; + + if (frontswap_ops && sis->frontswap_map) + ret = test_bit(offset, sis->frontswap_map); + return ret; +} +EXPORT_SYMBOL(__frontswap_test); + +static inline void __frontswap_clear(struct swap_info_struct *sis, + pgoff_t offset) { - frontswap_clear(sis, offset); + clear_bit(offset, sis->frontswap_map); atomic_dec(&sis->frontswap_pages); } @@ -147,13 +221,20 @@ int __frontswap_store(struct page *page) struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + /* + * Return if no backend is registered. + * Don't need to inc frontswap_failed_stores here. + */ + if (!frontswap_ops) + return ret; + BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - if (frontswap_test(sis, offset)) + if (__frontswap_test(sis, offset)) dup = 1; - ret = frontswap_ops.store(type, offset, page); + ret = frontswap_ops->store(type, offset, page); if (ret == 0) { - frontswap_set(sis, offset); + set_bit(offset, sis->frontswap_map); inc_frontswap_succ_stores(); if (!dup) atomic_inc(&sis->frontswap_pages); @@ -188,13 +269,16 @@ int __frontswap_load(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - if (frontswap_test(sis, offset)) - ret = frontswap_ops.load(type, offset, page); + /* + * __frontswap_test() will check whether there is a backend registered + */ + if (__frontswap_test(sis, offset)) + ret = frontswap_ops->load(type, offset, page); if (ret == 0) { inc_frontswap_loads(); if (frontswap_tmem_exclusive_gets_enabled) { SetPageDirty(page); - frontswap_clear(sis, offset); + __frontswap_clear(sis, offset); } } return ret; @@ -210,8 +294,11 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) struct swap_info_struct *sis = swap_info[type]; BUG_ON(sis == NULL); - if (frontswap_test(sis, offset)) { - frontswap_ops.invalidate_page(type, offset); + /* + * __frontswap_test() will check whether there is a backend registered + */ + if (__frontswap_test(sis, offset)) { + frontswap_ops->invalidate_page(type, offset); __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } @@ -226,12 +313,15 @@ void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; - BUG_ON(sis == NULL); - if (sis->frontswap_map == NULL) - return; - frontswap_ops.invalidate_area(type); - atomic_set(&sis->frontswap_pages, 0); - memset(sis->frontswap_map, 0, sis->max / sizeof(long)); + if (frontswap_ops) { + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + frontswap_ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + bitmap_zero(sis->frontswap_map, sis->max); + } + 
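/* Drop any deferred-init request recorded for this swap type (see need_init above). */ +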
clear_bit(type, need_init); } EXPORT_SYMBOL(__frontswap_invalidate_area); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9e894edc7811..243e710c6039 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -20,6 +20,7 @@ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/migrate.h> +#include <linux/hashtable.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; static int khugepaged(void *none); -static int mm_slots_hash_init(void); static int khugepaged_slab_init(void); -static void khugepaged_slab_free(void); -#define MM_SLOTS_HASH_HEADS 1024 -static struct hlist_head *mm_slots_hash __read_mostly; +#define MM_SLOTS_HASH_BITS 10 +static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); + static struct kmem_cache *mm_slot_cache __read_mostly; /** @@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void) struct zone *zone; int nr_zones = 0; unsigned long recommended_min; - extern int min_free_kbytes; if (!khugepaged_enabled()) return 0; @@ -164,35 +163,34 @@ static int start_khugepaged(void) } static atomic_t huge_zero_refcount; -static unsigned long huge_zero_pfn __read_mostly; +static struct page *huge_zero_page __read_mostly; -static inline bool is_huge_zero_pfn(unsigned long pfn) +static inline bool is_huge_zero_page(struct page *page) { - unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); - return zero_pfn && pfn == zero_pfn; + return ACCESS_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { - return is_huge_zero_pfn(pmd_pfn(pmd)); + return is_huge_zero_page(pmd_page(pmd)); } -static unsigned long get_huge_zero_page(void) +static struct page *get_huge_zero_page(void) { struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return ACCESS_ONCE(huge_zero_pfn); + return ACCESS_ONCE(huge_zero_page); zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_page) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); - return 0; + return NULL; } count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); - if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { + if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); __free_page(zero_page); goto retry; @@ -201,7 +199,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return ACCESS_ONCE(huge_zero_pfn); + return ACCESS_ONCE(huge_zero_page); } static void put_huge_zero_page(void) @@ -221,9 +219,9 @@ static int shrink_huge_zero_page(struct shrinker *shrink, return atomic_read(&huge_zero_refcount) == 1 ? 
HPAGE_PMD_NR : 0; if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { - unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); - BUG_ON(zero_pfn == 0); - __free_page(__pfn_to_page(zero_pfn)); + struct page *zero_page = xchg(&huge_zero_page, NULL); + BUG_ON(zero_page == NULL); + __free_page(zero_page); } return 0; @@ -634,12 +632,6 @@ static int __init hugepage_init(void) if (err) goto out; - err = mm_slots_hash_init(); - if (err) { - khugepaged_slab_free(); - goto out; - } - register_shrinker(&huge_zero_page_shrinker); /* @@ -720,6 +712,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return VM_FAULT_OOM; clear_huge_page(page, haddr, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. + */ __SetPageUptodate(page); spin_lock(&mm->page_table_lock); @@ -731,15 +728,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, } else { pmd_t entry; entry = mk_huge_pmd(page, vma); - /* - * The spinlocking to take the lru_lock inside - * page_add_new_anon_rmap() acts as a full memory - * barrier to be sure clear_huge_page writes become - * visible after the set_pmd_at() write. - */ page_add_new_anon_rmap(page, vma, haddr); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -772,16 +763,16 @@ static inline struct page *alloc_hugepage(int defrag) static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - unsigned long zero_pfn) + struct page *zero_page) { pmd_t entry; if (!pmd_none(*pmd)) return false; - entry = pfn_pmd(zero_pfn, vma->vm_page_prot); + entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); mm->nr_ptes++; return true; } @@ -802,20 +793,20 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; - unsigned long zero_pfn; + struct page *zero_page; bool set; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_pfn = get_huge_zero_page(); - if (unlikely(!zero_pfn)) { + zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) { pte_free(mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); goto out; } spin_lock(&mm->page_table_lock); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_pfn); + zero_page); spin_unlock(&mm->page_table_lock); if (!set) { pte_free(mm, pgtable); @@ -894,16 +885,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * a page table. */ if (is_huge_zero_pmd(pmd)) { - unsigned long zero_pfn; + struct page *zero_page; bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. 
*/ - zero_pfn = get_huge_zero_page(); + zero_page = get_huge_zero_page(); set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, - zero_pfn); + zero_page); BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; @@ -925,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - pgtable_trans_huge_deposit(dst_mm, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -996,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1094,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1257,6 +1248,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; + /* Avoid dumping huge zero page */ + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) + return ERR_PTR(-EFAULT); + page = pmd_page(*pmd); VM_BUG_ON(!PageHead(page)); if (flags & FOLL_TOUCH) { @@ -1270,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * young bit, instead of the current set_pmd_at. */ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { @@ -1298,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid; int current_nid = -1; bool migrated; - bool page_locked = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1320,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Acquire the page lock to serialise THP migrations */ spin_unlock(&mm->page_table_lock); lock_page(page); - page_locked = true; /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); @@ -1333,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate the THP to the requested node */ migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, - page, target_nid); - if (migrated) - current_nid = target_nid; - else { - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - unlock_page(page); - goto out_unlock; - } - goto clear_pmdnuma; - } + pmdp, pmd, addr, page, target_nid); + if (!migrated) + goto check_same; - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(target_nid, HPAGE_PMD_NR, true); return 0; +check_same: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); - if (page_locked) - unlock_page(page); - out_unlock: 
spin_unlock(&mm->page_table_lock); if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; } @@ -1373,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; pgtable_t pgtable; pmd_t orig_pmd; - pgtable = pgtable_trans_huge_withdraw(tlb->mm); + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); @@ -1444,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (ret == 1) { pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(&mm->page_table_lock); } out: @@ -1573,7 +1566,8 @@ static int __split_huge_page_splitting(struct page *page, return ret; } -static void __split_huge_page_refcount(struct page *page) +static void __split_huge_page_refcount(struct page *page, + struct list_head *list) { int i; struct zone *zone = page_zone(page); @@ -1652,14 +1646,14 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_xchg_last_nid(page_tail, page_last_nid(page)); + page_nid_xchg_last(page_tail, page_nid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - lru_add_page_tail(page, page_tail, lruvec); + lru_add_page_tail(page, page_tail, lruvec, list); } atomic_sub(tail_count, &page->_count); BUG_ON(atomic_read(&page->_count) <= 0); @@ -1705,7 +1699,7 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); haddr = address; @@ -1766,7 +1760,8 @@ static int __split_huge_page_map(struct page *page, /* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, - struct anon_vma *anon_vma) + struct anon_vma *anon_vma, + struct list_head *list) { int mapcount, mapcount2; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1797,7 +1792,7 @@ static void __split_huge_page(struct page *page, mapcount, page_mapcount(page)); BUG_ON(mapcount != page_mapcount(page)); - __split_huge_page_refcount(page); + __split_huge_page_refcount(page, list); mapcount2 = 0; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { @@ -1812,27 +1807,45 @@ static void __split_huge_page(struct page *page, BUG_ON(mapcount != mapcount2); } -int split_huge_page(struct page *page) +/* + * Split a hugepage into normal pages. This doesn't change the position of head + * page. If @list is null, tail pages will be added to LRU list, otherwise, to + * @list. Both head page and tail pages will inherit mapping, flags, and so on + * from the hugepage. + * Return 0 if the hugepage is split successfully otherwise return 1. 
+ */ +int split_huge_page_to_list(struct page *page, struct list_head *list) { struct anon_vma *anon_vma; int ret = 1; - BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); + BUG_ON(is_huge_zero_page(page)); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma_read(page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. + */ + anon_vma = page_get_anon_vma(page); if (!anon_vma) goto out; + anon_vma_lock_write(anon_vma); + ret = 0; if (!PageCompound(page)) goto out_unlock; BUG_ON(!PageSwapBacked(page)); - __split_huge_page(page, anon_vma); + __split_huge_page(page, anon_vma, list); count_vm_event(THP_SPLIT); BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma_read(anon_vma); + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); out: return ret; } @@ -1893,12 +1906,6 @@ static int __init khugepaged_slab_init(void) return 0; } -static void __init khugepaged_slab_free(void) -{ - kmem_cache_destroy(mm_slot_cache); - mm_slot_cache = NULL; -} - static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ @@ -1911,47 +1918,22 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) kmem_cache_free(mm_slot_cache, mm_slot); } -static int __init mm_slots_hash_init(void) -{ - mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), - GFP_KERNEL); - if (!mm_slots_hash) - return -ENOMEM; - return 0; -} - -#if 0 -static void __init mm_slots_hash_free(void) -{ - kfree(mm_slots_hash); - mm_slots_hash = NULL; -} -#endif - static struct mm_slot *get_mm_slot(struct mm_struct *mm) { struct mm_slot *mm_slot; - struct hlist_head *bucket; - struct hlist_node *node; - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; - hlist_for_each_entry(mm_slot, node, bucket, hash) { + hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) if (mm == mm_slot->mm) return mm_slot; - } + return NULL; } static void insert_to_mm_slots_hash(struct mm_struct *mm, struct mm_slot *mm_slot) { - struct hlist_head *bucket; - - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; mm_slot->mm = mm; - hlist_add_head(&mm_slot->hash, bucket); + hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); } static inline int khugepaged_test_exit(struct mm_struct *mm) @@ -2020,7 +2002,7 @@ void __khugepaged_exit(struct mm_struct *mm) spin_lock(&khugepaged_mm_lock); mm_slot = get_mm_slot(mm); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hlist_del(&mm_slot->hash); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); free = 1; } @@ -2351,9 +2333,14 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * point to regular pagetables.
Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(&mm->page_table_lock); - anon_vma_unlock(vma->anon_vma); + anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2361,7 +2348,7 @@ static void collapse_huge_page(struct mm_struct *mm, * All pages are isolated and locked so anon_vma rmap * can't run anymore. */ - anon_vma_unlock(vma->anon_vma); + anon_vma_unlock_write(vma->anon_vma); __collapse_huge_page_copy(pte, new_page, vma, address, ptl); pte_unmap(pte); @@ -2380,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; @@ -2408,7 +2395,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; - int node = -1; + int node = NUMA_NO_NODE; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -2438,7 +2425,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * be more sophisticated and look at more pages, * but isn't for now. */ - if (node == -1) + if (node == NUMA_NO_NODE) node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) @@ -2469,7 +2456,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) if (khugepaged_test_exit(mm)) { /* free mm_slot */ - hlist_del(&mm_slot->hash); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); /* @@ -2688,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4f3ea0b1e57c..83aff0a4d093 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -127,7 +127,7 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode) static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { - return subpool_inode(vma->vm_file->f_dentry->d_inode); + return subpool_inode(file_inode(vma->vm_file)); } /* @@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) hstate = hstate_vma(vma); - return 1UL << (hstate->order + PAGE_SHIFT); + return 1UL << huge_page_shift(hstate); } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); @@ -690,6 +690,23 @@ int PageHuge(struct page *page) } EXPORT_SYMBOL_GPL(PageHuge); +pgoff_t __basepage_index(struct page *page) +{ + struct page *page_head = compound_head(page); + pgoff_t index = page_index(page_head); + unsigned long compound_idx; + + if (!PageHuge(page_head)) + return page_index(page); + + if (compound_order(page_head) >= MAX_ORDER) + compound_idx = page_to_pfn(page) - page_to_pfn(page_head); + else + compound_idx = page - page_head; + + return (index << compound_order(page_head)) + compound_idx; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1246,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void) * side-effects, like CommitLimit going negative. 
*/ if (h->order > (MAX_ORDER - 1)) - totalram_pages += 1 << h->order; + adjust_managed_page_count(page, 1 << h->order); } } @@ -1293,8 +1310,7 @@ static void __init report_hugepages(void) for_each_hstate(h) { char buf[32]; - printk(KERN_INFO "HugeTLB registered %s page size, " - "pre-allocated %ld pages\n", + pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", memfmt(buf, huge_page_size(h)), h->free_huge_pages); } @@ -1702,8 +1718,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - printk(KERN_ERR "Hugetlb: Unable to add hstate %s", - h->name); + pr_err("Hugetlb: Unable to add hstate %s", h->name); } } @@ -1763,7 +1778,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ -void hugetlb_unregister_node(struct node *node) +static void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -1807,7 +1822,7 @@ static void hugetlb_unregister_all_nodes(void) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -void hugetlb_register_node(struct node *node) +static void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -1826,9 +1841,8 @@ void hugetlb_register_node(struct node *node) nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { - printk(KERN_ERR "Hugetlb: Unable to add hstate %s" - " for node %d\n", - h->name, node->dev.id); + pr_err("Hugetlb: Unable to add hstate %s for node %d\n", + h->name, node->dev.id); hugetlb_unregister_node(node); break; } @@ -1924,7 +1938,7 @@ void __init hugetlb_add_hstate(unsigned order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + pr_warning("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -1960,8 +1974,8 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - printk(KERN_WARNING "hugepages= specified twice without " - "interleaving hugepagesz=, ignoring\n"); + pr_warning("hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); return 1; } @@ -2124,11 +2138,30 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, h->surplus_huge_pages_node[nid]); } +void hugetlb_show_meminfo(void) +{ + struct hstate *h; + int nid; + + for_each_node_state(nid, N_MEMORY) + for_each_hstate(h) + pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", + nid, + h->nr_huge_pages_node[nid], + h->free_huge_pages_node[nid], + h->surplus_huge_pages_node[nid], + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); +} + /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ unsigned long hugetlb_total_pages(void) { - struct hstate *h = &default_hstate; - return h->nr_huge_pages * pages_per_huge_page(h); + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) @@ -2246,10 +2279,11 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, pte_t entry; if (writable) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, + vma->vm_page_prot))); } else { - entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = huge_pte_wrprotect(mk_huge_pte(page, + vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); @@ -2263,7 +2297,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, { pte_t entry; - entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } @@ -2378,7 +2412,7 @@ again: * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { - pte_clear(mm, address, ptep); + huge_pte_clear(mm, address, ptep); continue; } @@ -2402,7 +2436,7 @@ again: pte = huge_ptep_get_and_clear(mm, address, ptep); tlb_remove_tlb_entry(tlb, ptep, address); - if (pte_dirty(pte)) + if (huge_pte_dirty(pte)) set_page_dirty(page); page_remove_rmap(page); @@ -2482,7 +2516,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + mapping = file_inode(vma->vm_file)->i_mapping; /* * Take the mapping lock for the duration of the table walk. As @@ -2692,9 +2726,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { - printk(KERN_WARNING - "PID %d killed due to inadequate hugepage pool\n", - current->pid); + pr_warning("PID %d killed due to inadequate hugepage pool\n", + current->pid); return ret; } @@ -2823,7 +2856,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -2856,7 +2889,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. 
*/ - if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } @@ -2886,12 +2919,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { - if (!pte_write(entry)) { + if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); goto out_page_table_lock; } - entry = pte_mkdirty(entry); + entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, address, ptep, entry, @@ -2915,23 +2948,14 @@ out_mutex: return ret; } -/* Can be overriden by architectures */ -__attribute__((weak)) struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - BUG(); - return NULL; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i, - unsigned int flags) +long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; - int remainder = *length; + unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); spin_lock(&mm->page_table_lock); @@ -2961,8 +2985,19 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (absent || - ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { + /* + * We need to call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing them). In order to do this, we use + * is_swap_pte here instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. + */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || + ((flags & FOLL_WRITE) && + !huge_pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); @@ -3001,7 +3036,7 @@ same_page: } } spin_unlock(&mm->page_table_lock); - *length = remainder; + *nr_pages = remainder; *position = vaddr; return i ?
i : -EFAULT; @@ -3032,7 +3067,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); - pte = pte_mkhuge(pte_modify(pte, newprot)); + pte = pte_mkhuge(huge_pte_modify(pte, newprot)); + pte = arch_make_huge_pte(pte, vma, NULL, 0); set_huge_pte_at(mm, address, ptep, pte); pages++; } @@ -3141,6 +3177,216 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_acct_memory(h, -(chg - freed)); } +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static unsigned long page_table_shareable(struct vm_area_struct *svma, + struct vm_area_struct *vma, + unsigned long addr, pgoff_t idx) +{ + unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + + svma->vm_start; + unsigned long sbase = saddr & PUD_MASK; + unsigned long s_end = sbase + PUD_SIZE; + + /* Allow segments to share if only one is marked locked */ + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + + /* + * match the virtual addresses, permission and the alignment of the + * page table page. + */ + if (pmd_index(addr) != pmd_index(saddr) || + vm_flags != svm_flags || + sbase < svma->vm_start || svma->vm_end < s_end) + return 0; + + return saddr; +} + +static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & PUD_MASK; + unsigned long end = base + PUD_SIZE; + + /* + * check on proper vm_flags and page table alignment + */ + if (vma->vm_flags & VM_MAYSHARE && + vma->vm_start <= base && end <= vma->vm_end) + return 1; + return 0; +} + +/* + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_mutex section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. + */ +pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + struct vm_area_struct *svma; + unsigned long saddr; + pte_t *spte = NULL; + pte_t *pte; + + if (!vma_shareable(vma, addr)) + return (pte_t *)pmd_alloc(mm, pud, addr); + + mutex_lock(&mapping->i_mmap_mutex); + vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { + if (svma == vma) + continue; + + saddr = page_table_shareable(svma, vma, addr, idx); + if (saddr) { + spte = huge_pte_offset(svma->vm_mm, saddr); + if (spte) { + get_page(virt_to_page(spte)); + break; + } + } + } + + if (!spte) + goto out; + + spin_lock(&mm->page_table_lock); + if (pud_none(*pud)) + pud_populate(mm, pud, + (pmd_t *)((unsigned long)spte & PAGE_MASK)); + else + put_page(virt_to_page(spte)); + spin_unlock(&mm->page_table_lock); +out: + pte = (pte_t *)pmd_alloc(mm, pud, addr); + mutex_unlock(&mapping->i_mmap_mutex); + return pte; +} + +/* + * unmap huge page backed by shared pte. + * + * Hugetlb pte page is ref counted at the time of mapping. If pte is shared + * indicated by page_count > 1, unmap is achieved by clearing pud and + * decrementing the ref count. If count == 1, the pte page is not shared. 
+ * + * called with vma->vm_mm->page_table_lock held. + * + * returns: 1 successfully unmapped a shared pte page + * 0 the underlying pte page is not shared, or it is the last user + */ +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + pgd_t *pgd = pgd_offset(mm, *addr); + pud_t *pud = pud_offset(pgd, *addr); + + BUG_ON(page_count(virt_to_page(ptep)) == 0); + if (page_count(virt_to_page(ptep)) == 1) + return 0; + + pud_clear(pud); + put_page(virt_to_page(ptep)); + *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; + return 1; +} +#define want_pmd_share() (1) +#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +{ + return NULL; +} +#define want_pmd_share() (0) +#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (pud) { + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else { + BUG_ON(sz != PMD_SIZE); + if (want_pmd_share() && pud_none(*pud)) + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } + } + BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); + + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (pud_present(*pud)) { + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + } + } + return (pte_t *) pmd; +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pmd); + if (page) + page += ((address & ~PMD_MASK) >> PAGE_SHIFT); + return page; +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pud); + if (page) + page += ((address & ~PUD_MASK) >> PAGE_SHIFT); + return page; +} + +#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ + +/* Can be overriden by architectures */ +__attribute__((weak)) struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + +#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ + #ifdef CONFIG_MEMORY_FAILURE /* Should be called in hugetlb_lock */ diff --git a/mm/internal.h b/mm/internal.h index d597f94cc205..4390ac6c106e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __put_page(struct page *page) -{ - atomic_dec(&page->_count); -} - static inline void __get_page_tail_foll(struct page *page, bool get_page_head) { @@ -135,7 +130,6 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool contended; /* True if a lock was contended */ - struct page **page; /* Page captured of requested size */ }; unsigned long @@ -163,8 +157,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); #ifdef CONFIG_MMU -extern long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); +extern long __mlock_vma_pages_range(struct vm_area_struct *vma, + 
unsigned long start, unsigned long end, int *nonblocking); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) @@ -196,7 +190,7 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); -extern void munlock_vma_page(struct page *page); +extern unsigned int munlock_vma_page(struct page *page); /* * Clear the page's PageMlocked(). This can be useful in a situation where diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 752a705c77c2..c8d7f3110fd0 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -436,7 +436,7 @@ static int get_object(struct kmemleak_object *object) */ static void free_object_rcu(struct rcu_head *rcu) { - struct hlist_node *elem, *tmp; + struct hlist_node *tmp; struct kmemleak_scan_area *area; struct kmemleak_object *object = container_of(rcu, struct kmemleak_object, rcu); @@ -445,8 +445,8 @@ static void free_object_rcu(struct rcu_head *rcu) * Once use_count is 0 (guaranteed by put_object), there is no other * code accessing this object, hence no need for locking. */ - hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { - hlist_del(elem); + hlist_for_each_entry_safe(area, tmp, &object->area_list, node) { + hlist_del(&area->node); kmem_cache_free(scan_area_cache, area); } kmem_cache_free(object_cache, object); @@ -1177,7 +1177,6 @@ static void scan_block(void *_start, void *_end, static void scan_object(struct kmemleak_object *object) { struct kmemleak_scan_area *area; - struct hlist_node *elem; unsigned long flags; /* @@ -1205,7 +1204,7 @@ static void scan_object(struct kmemleak_object *object) spin_lock_irqsave(&object->lock, flags); } } else - hlist_for_each_entry(area, elem, &object->area_list, node) + hlist_for_each_entry(area, &object->area_list, node) scan_block((void *)area->start, (void *)(area->start + area->size), object, 0); @@ -1300,9 +1299,8 @@ static void kmemleak_scan(void) */ lock_memory_hotplug(); for_each_online_node(i) { - pg_data_t *pgdat = NODE_DATA(i); - unsigned long start_pfn = pgdat->node_start_pfn; - unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long start_pfn = node_start_pfn(i); + unsigned long end_pfn = node_end_pfn(i); unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { @@ -33,13 +33,22 @@ #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> -#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> +#include <linux/numa.h> #include <asm/tlbflush.h> #include "internal.h" +#ifdef CONFIG_NUMA +#define NUMA(x) (x) +#define DO_NUMA(x) do { (x); } while (0) +#else +#define NUMA(x) (0) +#define DO_NUMA(x) do { } while (0) +#endif + /* * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: @@ -78,6 +87,9 @@ * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) + * + * If the merge_across_nodes tunable is unset, then KSM maintains multiple + * stable trees and multiple unstable trees: one of each for each NUMA node. 
*/ /** @@ -113,19 +125,32 @@ struct ksm_scan { /** * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree + * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page - * @kpfn: page frame number of this ksm page + * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { - struct rb_node node; + union { + struct rb_node node; /* when node of stable tree */ + struct { /* when listed for migration */ + struct list_head *head; + struct list_head list; + }; + }; struct hlist_head hlist; unsigned long kpfn; +#ifdef CONFIG_NUMA + int nid; +#endif }; /** * struct rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree + * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address @@ -135,7 +160,12 @@ struct stable_node { */ struct rmap_item { struct rmap_item *rmap_list; - struct anon_vma *anon_vma; /* when stable */ + union { + struct anon_vma *anon_vma; /* when stable */ +#ifdef CONFIG_NUMA + int nid; /* when node of unstable tree */ +#endif + }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ @@ -153,12 +183,16 @@ struct rmap_item { #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ -static struct rb_root root_stable_tree = RB_ROOT; -static struct rb_root root_unstable_tree = RB_ROOT; +static struct rb_root one_stable_tree[1] = { RB_ROOT }; +static struct rb_root one_unstable_tree[1] = { RB_ROOT }; +static struct rb_root *root_stable_tree = one_stable_tree; +static struct rb_root *root_unstable_tree = one_unstable_tree; -#define MM_SLOTS_HASH_SHIFT 10 -#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) -static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; +/* Recently migrated nodes of stable tree, pending proper placement */ +static LIST_HEAD(migrate_nodes); + +#define MM_SLOTS_HASH_BITS 10 +static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), @@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; +#ifdef CONFIG_NUMA +/* Zeroed when merging across nodes is not allowed */ +static unsigned int ksm_merge_across_nodes = 1; +static int ksm_nr_node_ids = 1; +#else +#define ksm_merge_across_nodes 1U +#define ksm_nr_node_ids 1 +#endif + #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 -static unsigned int ksm_run = KSM_RUN_STOP; +#define KSM_RUN_OFFLINE 4 +static unsigned long ksm_run = KSM_RUN_STOP; +static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DEFINE_MUTEX(ksm_thread_mutex); @@ -275,31 +320,20 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) static struct mm_slot *get_mm_slot(struct 
mm_struct *mm) { - struct mm_slot *mm_slot; - struct hlist_head *bucket; - struct hlist_node *node; + struct mm_slot *slot; + + hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) + if (slot->mm == mm) + return slot; - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; - hlist_for_each_entry(mm_slot, node, bucket, link) { - if (mm == mm_slot->mm) - return mm_slot; - } return NULL; } static void insert_to_mm_slots_hash(struct mm_struct *mm, struct mm_slot *mm_slot) { - struct hlist_head *bucket; - - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; mm_slot->mm = mm; - hlist_add_head(&mm_slot->link, bucket); -} - -static inline int in_stable_tree(struct rmap_item *rmap_item) -{ - return rmap_item->address & STABLE_FLAG; + hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); } /* @@ -333,7 +367,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); - page = follow_page(vma, addr, FOLL_GET); + page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) @@ -447,12 +481,22 @@ out: page = NULL; return page; } +/* + * This helper is used for getting right index into array of tree roots. + * When merge_across_nodes knob is set to 1, there are only two rb-trees for + * stable and unstable pages from all nodes with roots in index 0. Otherwise, + * every node has its own stable and unstable tree. + */ +static inline int get_kpfn_nid(unsigned long kpfn) +{ + return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; - struct hlist_node *hlist; - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) ksm_pages_sharing--; else @@ -462,7 +506,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) cond_resched(); } - rb_erase(&stable_node->node, &root_stable_tree); + if (stable_node->head == &migrate_nodes) + list_del(&stable_node->list); + else + rb_erase(&stable_node->node, + root_stable_tree + NUMA(stable_node->nid)); free_stable_node(stable_node); } @@ -472,6 +520,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. + * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for @@ -482,40 +531,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. - * - * include/linux/pagemap.h page_cache_get_speculative() is a good reference, - * but this is different - made simpler by ksm_thread_mutex being held, but - * interesting for assuming that no other use of the struct page could ever - * put our expected_mapping into page->mapping (or a field of the union which - * coincides with page->mapping). The RCU calls are not for KSM at all, but - * to keep the page_count protocol described with page_cache_get_speculative. 
- * - * Note: it is possible that get_ksm_page() will return NULL one moment, - * then page the next, if the page is in between page_freeze_refs() and - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node) +static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) { struct page *page; void *expected_mapping; + unsigned long kpfn; - page = pfn_to_page(stable_node->kpfn); expected_mapping = (void *)stable_node + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); - rcu_read_lock(); - if (page->mapping != expected_mapping) - goto stale; - if (!get_page_unless_zero(page)) +again: + kpfn = ACCESS_ONCE(stable_node->kpfn); + page = pfn_to_page(kpfn); + + /* + * page is computed from kpfn, so on most architectures reading + * page->mapping is naturally ordered after reading node->kpfn, + * but on Alpha we need to be more careful. + */ + smp_read_barrier_depends(); + if (ACCESS_ONCE(page->mapping) != expected_mapping) goto stale; - if (page->mapping != expected_mapping) { + + /* + * We cannot do anything with the page while its refcount is 0. + * Usually 0 means free, or tail of a higher-order page: in which + * case this node is no longer referenced, and should be freed; + * however, it might mean that the page is under page_freeze_refs(). + * The __remove_mapping() case is easy, again the node is now stale; + * but if page is swapcache in migrate_page_move_mapping(), it might + * still be our page, in which case it's essential to keep the node. + */ + while (!get_page_unless_zero(page)) { + /* + * Another check for page->mapping != expected_mapping would + * work here too. We have chosen the !PageSwapCache test to + * optimize the common case, when the page is or is about to + * be freed: PageSwapCache is cleared (under spin_lock_irq) + * in the freeze_refs section of __remove_mapping(); but Anon + * page->mapping reset to NULL later, in free_pages_prepare(). + */ + if (!PageSwapCache(page)) + goto stale; + cpu_relax(); + } + + if (ACCESS_ONCE(page->mapping) != expected_mapping) { put_page(page); goto stale; } - rcu_read_unlock(); + + if (lock_it) { + lock_page(page); + if (ACCESS_ONCE(page->mapping) != expected_mapping) { + unlock_page(page); + put_page(page); + goto stale; + } + } return page; + stale: - rcu_read_unlock(); + /* + * We come here from above when page->mapping or !PageSwapCache + * suggests that the node is stale; but it might be under migration. + * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), + * before checking whether node->kpfn has been changed. 
+ */ + smp_rmb(); + if (ACCESS_ONCE(stable_node->kpfn) != kpfn) + goto again; remove_node_from_stable_tree(stable_node); return NULL; } @@ -531,11 +617,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node); + page = get_ksm_page(stable_node, true); if (!page) goto out; - lock_page(page); hlist_del(&rmap_item->hlist); unlock_page(page); put_page(page); @@ -560,8 +645,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) - rb_erase(&rmap_item->node, &root_unstable_tree); - + rb_erase(&rmap_item->node, + root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } @@ -581,7 +666,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, } /* - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing @@ -614,6 +699,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, /* * Only called through the sysfs control interface: */ +static int remove_stable_node(struct stable_node *stable_node) +{ + struct page *page; + int err; + + page = get_ksm_page(stable_node, true); + if (!page) { + /* + * get_ksm_page did remove_node_from_stable_tree itself. + */ + return 0; + } + + if (WARN_ON_ONCE(page_mapped(page))) { + /* + * This should not happen: but if it does, just refuse to let + * merge_across_nodes be switched - there is no need to panic. + */ + err = -EBUSY; + } else { + /* + * The stable node did not yet appear stale to get_ksm_page(), + * since that allows for an unmapped ksm page to be recognized + * right up until it is freed; but the node is safe to remove. + * This page might be in a pagevec waiting to be freed, + * or it might be PageSwapCache (perhaps under writeback), + * or it might have been removed from swapcache a moment ago. 
+ */ + set_page_stable_node(page, NULL); + remove_node_from_stable_tree(stable_node); + err = 0; + } + + unlock_page(page); + put_page(page); + return err; +} + +static int remove_all_stable_nodes(void) +{ + struct stable_node *stable_node; + struct list_head *this, *next; + int nid; + int err = 0; + + for (nid = 0; nid < ksm_nr_node_ids; nid++) { + while (root_stable_tree[nid].rb_node) { + stable_node = rb_entry(root_stable_tree[nid].rb_node, + struct stable_node, node); + if (remove_stable_node(stable_node)) { + err = -EBUSY; + break; /* proceed to next nid */ + } + cond_resched(); + } + } + list_for_each_safe(this, next, &migrate_nodes) { + stable_node = list_entry(this, struct stable_node, list); + if (remove_stable_node(stable_node)) + err = -EBUSY; + cond_resched(); + } + return err; +} + static int unmerge_and_remove_all_rmap_items(void) { struct mm_slot *mm_slot; @@ -647,7 +797,7 @@ static int unmerge_and_remove_all_rmap_items(void) ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, struct mm_slot, mm_list); if (ksm_test_exit(mm)) { - hlist_del(&mm_slot->link); + hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); spin_unlock(&ksm_mmlist_lock); @@ -661,6 +811,8 @@ static int unmerge_and_remove_all_rmap_items(void) } } + /* Clean up stable nodes, but don't worry if some are still busy */ + remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; @@ -946,6 +1098,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, if (err) goto out; + /* Unstable nid is in union with stable anon_vma: remove first */ + remove_rmap_item_from_tree(rmap_item); + /* Must get reference to anon_vma while still holding mmap_sem */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); @@ -996,42 +1151,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, */ static struct page *stable_tree_search(struct page *page) { - struct rb_node *node = root_stable_tree.rb_node; + int nid; + struct rb_root *root; + struct rb_node **new; + struct rb_node *parent; struct stable_node *stable_node; + struct stable_node *page_node; - stable_node = page_stable_node(page); - if (stable_node) { /* ksm page forked */ + page_node = page_stable_node(page); + if (page_node && page_node->head != &migrate_nodes) { + /* ksm page forked */ get_page(page); return page; } - while (node) { + nid = get_kpfn_nid(page_to_pfn(page)); + root = root_stable_tree + nid; +again: + new = &root->rb_node; + parent = NULL; + + while (*new) { struct page *tree_page; int ret; cond_resched(); - stable_node = rb_entry(node, struct stable_node, node); - tree_page = get_ksm_page(stable_node); + stable_node = rb_entry(*new, struct stable_node, node); + tree_page = get_ksm_page(stable_node, false); if (!tree_page) return NULL; ret = memcmp_pages(page, tree_page); + put_page(tree_page); - if (ret < 0) { - put_page(tree_page); - node = node->rb_left; - } else if (ret > 0) { - put_page(tree_page); - node = node->rb_right; - } else - return tree_page; + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + /* + * Lock and unlock the stable_node's page (which + * might already have been migrated) so that page + * migration is sure to notice its raised count. + * It would be more elegant to return stable_node + * than kpage, but that involves more changes. 
+ */ + tree_page = get_ksm_page(stable_node, true); + if (tree_page) { + unlock_page(tree_page); + if (get_kpfn_nid(stable_node->kpfn) != + NUMA(stable_node->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; + } + /* + * There is now a place for page_node, but the tree may + * have been rebalanced, so re-evaluate parent and new. + */ + if (page_node) + goto again; + return NULL; + } } - return NULL; + if (!page_node) + return NULL; + + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_link_node(&page_node->node, parent, new); + rb_insert_color(&page_node->node, root); + get_page(page); + return page; + +replace: + if (page_node) { + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node->node, &page_node->node, root); + get_page(page); + } else { + rb_erase(&stable_node->node, root); + page = NULL; + } + stable_node->head = &migrate_nodes; + list_add(&stable_node->list, stable_node->head); + return page; } /* - * stable_tree_insert - insert rmap_item pointing to new ksm page + * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, @@ -1039,17 +1251,25 @@ static struct page *stable_tree_search(struct page *page) */ static struct stable_node *stable_tree_insert(struct page *kpage) { - struct rb_node **new = &root_stable_tree.rb_node; + int nid; + unsigned long kpfn; + struct rb_root *root; + struct rb_node **new; struct rb_node *parent = NULL; struct stable_node *stable_node; + kpfn = page_to_pfn(kpage); + nid = get_kpfn_nid(kpfn); + root = root_stable_tree + nid; + new = &root->rb_node; + while (*new) { struct page *tree_page; int ret; cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node); + tree_page = get_ksm_page(stable_node, false); if (!tree_page) return NULL; @@ -1075,13 +1295,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage) if (!stable_node) return NULL; - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, &root_stable_tree); - INIT_HLIST_HEAD(&stable_node->hlist); - - stable_node->kpfn = page_to_pfn(kpage); + stable_node->kpfn = kpfn; set_page_stable_node(kpage, stable_node); + DO_NUMA(stable_node->nid = nid); + rb_link_node(&stable_node->node, parent, new); + rb_insert_color(&stable_node->node, root); return stable_node; } @@ -1104,10 +1323,15 @@ static struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, struct page *page, struct page **tree_pagep) - { - struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node **new; + struct rb_root *root; struct rb_node *parent = NULL; + int nid; + + nid = get_kpfn_nid(page_to_pfn(page)); + root = root_unstable_tree + nid; + new = &root->rb_node; while (*new) { struct rmap_item *tree_rmap_item; @@ -1137,6 +1361,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; + } else if (!ksm_merge_across_nodes && + page_to_nid(tree_page) != nid) { + /* + * If tree_page has been migrated to another NUMA node, + * it will be flushed out and put in the right unstable + * tree next time: only merge with it when across_nodes. 
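Both stable_tree_search() and stable_tree_insert() above use the standard rbtree descend-then-link idiom, plus rb_replace_node() when an existing node must be swapped out in place. A toy version of the idiom, keyed by an unsigned long instead of a memcmp of page contents (all demo_* names are illustrative):

#include <linux/rbtree.h>

struct demo_node {
	struct rb_node node;
	unsigned long key;
};

static struct demo_node *demo_tree_insert(struct rb_root *root,
					  struct demo_node *new)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*link) {
		struct demo_node *entry = rb_entry(*link, struct demo_node, node);

		parent = *link;
		if (new->key < entry->key)
			link = &parent->rb_left;
		else if (new->key > entry->key)
			link = &parent->rb_right;
		else
			return entry;	/* duplicate: caller decides what to do */
	}
	rb_link_node(&new->node, parent, link);	/* hook up at the found leaf */
	rb_insert_color(&new->node, root);	/* then rebalance/recolor */
	return NULL;
}

Keeping parent and the link pointer from the descent is what lets the search double as the insertion point, which is exactly how the reworked stable_tree_search() can link page_node in when the lookup misses.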
+ */ + put_page(tree_page); + return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; @@ -1145,8 +1378,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); + DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); - rb_insert_color(&rmap_item->node, &root_unstable_tree); + rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; @@ -1188,10 +1422,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) unsigned int checksum; int err; - remove_rmap_item_from_tree(rmap_item); + stable_node = page_stable_node(page); + if (stable_node) { + if (stable_node->head != &migrate_nodes && + get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { + rb_erase(&stable_node->node, + root_stable_tree + NUMA(stable_node->nid)); + stable_node->head = &migrate_nodes; + list_add(&stable_node->list, stable_node->head); + } + if (stable_node->head != &migrate_nodes && + rmap_item->head == stable_node) + return; + } /* We first start with searching the page inside the stable tree */ kpage = stable_tree_search(page); + if (kpage == page && rmap_item->head == stable_node) { + put_page(kpage); + return; + } + + remove_rmap_item_from_tree(rmap_item); + if (kpage) { err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { @@ -1225,14 +1478,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) kpage = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); put_page(tree_page); - /* - * As soon as we merge this page, we want to remove the - * rmap_item of the page we have merged with from the unstable - * tree, and insert it instead as new node in the stable tree. - */ if (kpage) { - remove_rmap_item_from_tree(tree_rmap_item); - + /* + * The pages were successfully merged: insert new + * node in the stable tree and add both rmap_items. + */ lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { @@ -1289,6 +1539,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; + int nid; if (list_empty(&ksm_mm_head.mm_list)) return NULL; @@ -1307,7 +1558,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) */ lru_add_drain_all(); - root_unstable_tree = RB_ROOT; + /* + * Whereas stale stable_nodes on the stable_tree itself + * get pruned in the regular course of stable_tree_search(), + * those moved out to the migrate_nodes list can accumulate: + * so prune them once before each full scan. + */ + if (!ksm_merge_across_nodes) { + struct stable_node *stable_node; + struct list_head *this, *next; + struct page *page; + + list_for_each_safe(this, next, &migrate_nodes) { + stable_node = list_entry(this, + struct stable_node, list); + page = get_ksm_page(stable_node, false); + if (page) + put_page(page); + cond_resched(); + } + } + + for (nid = 0; nid < ksm_nr_node_ids; nid++) + root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); @@ -1392,7 +1665,7 @@ next_mm: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_sem then protects against race with MADV_MERGEABLE). 
*/ - hlist_del(&slot->link); + hash_del(&slot->link); list_del(&slot->mm_list); spin_unlock(&ksm_mmlist_lock); @@ -1428,8 +1701,7 @@ static void ksm_do_scan(unsigned int scan_npages) rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; - if (!PageKsm(page) || !in_stable_tree(rmap_item)) - cmp_and_merge_page(page, rmap_item); + cmp_and_merge_page(page, rmap_item); put_page(page); } } @@ -1446,6 +1718,7 @@ static int ksm_scan_thread(void *nothing) while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); @@ -1525,11 +1798,19 @@ int __ksm_enter(struct mm_struct *mm) spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); /* - * Insert just behind the scanning cursor, to let the area settle + * When KSM_RUN_MERGE (or KSM_RUN_STOP), + * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. + * + * But when KSM_RUN_UNMERGE, it's important to insert ahead of its + * scanning cursor, otherwise KSM pages in newly forked mms will be + * missed: then we might as well insert at the end of the list. */ - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + if (ksm_run & KSM_RUN_UNMERGE) + list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); + else + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -1559,7 +1840,7 @@ void __ksm_exit(struct mm_struct *mm) mm_slot = get_mm_slot(mm); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hlist_del(&mm_slot->link); + hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); easy_to_free = 1; } else { @@ -1579,24 +1860,32 @@ void __ksm_exit(struct mm_struct *mm) } } -struct page *ksm_does_need_to_copy(struct page *page, +struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { + struct anon_vma *anon_vma = page_anon_vma(page); struct page *new_page; + if (PageKsm(page)) { + if (page_stable_node(page) && + !(ksm_run & KSM_RUN_UNMERGE)) + return page; /* no need to copy it */ + } else if (!anon_vma) { + return page; /* no need to copy it */ + } else if (anon_vma->root == vma->anon_vma->root && + page->index == linear_page_index(vma, address)) { + return page; /* still no need to copy it */ + } + if (!PageUptodate(page)) + return page; /* let do_swap_page report the error */ + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); __SetPageUptodate(new_page); - SetPageSwapBacked(new_page); __set_page_locked(new_page); - - if (!mlocked_vma_newpage(vma, new_page)) - lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); - else - add_page_to_unevictable_list(new_page); } return new_page; @@ -1607,7 +1896,6 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, { struct stable_node *stable_node; struct rmap_item *rmap_item; - struct hlist_node *hlist; unsigned int mapcount = page_mapcount(page); int referenced = 0; int search_new_forks = 0; @@ -1619,7 +1907,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, if (!stable_node) return 0; again: - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct 
anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -1661,7 +1949,6 @@ out: int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) { struct stable_node *stable_node; - struct hlist_node *hlist; struct rmap_item *rmap_item; int ret = SWAP_AGAIN; int search_new_forks = 0; @@ -1673,7 +1960,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) if (!stable_node) return SWAP_FAIL; again: - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -1714,7 +2001,6 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct stable_node *stable_node; - struct hlist_node *hlist; struct rmap_item *rmap_item; int ret = SWAP_AGAIN; int search_new_forks = 0; @@ -1726,7 +2012,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, if (!stable_node) return ret; again: - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -1773,64 +2059,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) if (stable_node) { VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); stable_node->kpfn = page_to_pfn(newpage); + /* + * newpage->mapping was set in advance; now we need smp_wmb() + * to make sure that the new stable_node->kpfn is visible + * to get_ksm_page() before it can see that oldpage->mapping + * has gone stale (or that PageSwapCache has been cleared). 
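The smp_wmb() that ksm_migrate_page() now issues pairs with the smp_rmb() on get_ksm_page()'s stale path earlier in this patch. A sketch of that handshake, reduced to a hypothetical handle/slot pair (none of these demo_* names exist in the kernel):

#include <linux/compiler.h>
#include <asm/barrier.h>

struct demo_slot {
	int stale;			/* set once the slot's contents are retired */
};

struct demo_handle {
	unsigned long idx;		/* slot currently backing this handle */
};

static struct demo_slot demo_slots[16];

/* migration side: publish the new slot before retiring the old one */
static void demo_move(struct demo_handle *h, unsigned long new_idx)
{
	unsigned long old_idx = h->idx;

	h->idx = new_idx;
	smp_wmb();			/* pairs with smp_rmb() in demo_get() */
	demo_slots[old_idx].stale = 1;
}

/* lookup side: a stale slot may just mean the handle has moved on */
static struct demo_slot *demo_get(struct demo_handle *h)
{
	unsigned long idx;
	struct demo_slot *slot;

again:
	idx = ACCESS_ONCE(h->idx);
	slot = &demo_slots[idx];
	smp_read_barrier_depends();	/* slot was computed from idx (Alpha) */
	if (ACCESS_ONCE(slot->stale)) {
		smp_rmb();		/* pairs with smp_wmb() in demo_move() */
		if (ACCESS_ONCE(h->idx) != idx)
			goto again;	/* raced with demo_move(): retry */
		return NULL;		/* genuinely dead */
	}
	return slot;
}

In the real code the "handle" is the stable_node (its kpfn) and the "slot" is the struct page whose mapping stops matching once the old page is freed or migrated.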
+ */ + smp_wmb(); + set_page_stable_node(oldpage, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE -static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, - unsigned long end_pfn) +static int just_wait(void *word) { - struct rb_node *node; + schedule(); + return 0; +} - for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { - struct stable_node *stable_node; +static void wait_while_offlining(void) +{ + while (ksm_run & KSM_RUN_OFFLINE) { + mutex_unlock(&ksm_thread_mutex); + wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), + just_wait, TASK_UNINTERRUPTIBLE); + mutex_lock(&ksm_thread_mutex); + } +} - stable_node = rb_entry(node, struct stable_node, node); +static void ksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct stable_node *stable_node; + struct list_head *this, *next; + struct rb_node *node; + int nid; + + for (nid = 0; nid < ksm_nr_node_ids; nid++) { + node = rb_first(root_stable_tree + nid); + while (node) { + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) { + /* + * Don't get_ksm_page, page has already gone: + * which is why we keep kpfn instead of page* + */ + remove_node_from_stable_tree(stable_node); + node = rb_first(root_stable_tree + nid); + } else + node = rb_next(node); + cond_resched(); + } + } + list_for_each_safe(this, next, &migrate_nodes) { + stable_node = list_entry(this, struct stable_node, list); if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) - return stable_node; + remove_node_from_stable_tree(stable_node); + cond_resched(); } - return NULL; } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; - struct stable_node *stable_node; switch (action) { case MEM_GOING_OFFLINE: /* - * Keep it very simple for now: just lock out ksmd and - * MADV_UNMERGEABLE while any memory is going offline. - * mutex_lock_nested() is necessary because lockdep was alarmed - * that here we take ksm_thread_mutex inside notifier chain - * mutex, and later take notifier chain mutex inside - * ksm_thread_mutex to unlock it. But that's safe because both - * are inside mem_hotplug_mutex. + * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() + * and remove_all_stable_nodes() while memory is going offline: + * it is unsafe for them to touch the stable tree at this time. + * But unmerge_ksm_pages(), rmap lookups and other entry points + * which do not need the ksm_thread_mutex are all safe. */ - mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&ksm_thread_mutex); + ksm_run |= KSM_RUN_OFFLINE; + mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct - * pages which have been offlined: prune those from the tree. + * pages which have been offlined: prune those from the tree, + * otherwise get_ksm_page() might later try to access a + * non-existent struct page. 
*/ - while ((stable_node = ksm_check_stable_tree(mn->start_pfn, - mn->start_pfn + mn->nr_pages)) != NULL) - remove_node_from_stable_tree(stable_node); + ksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages); /* fallthrough */ case MEM_CANCEL_OFFLINE: + mutex_lock(&ksm_thread_mutex); + ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); + + smp_mb(); /* wake_up_bit advises this */ + wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } +#else +static void wait_while_offlining(void) +{ +} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_SYSFS @@ -1893,7 +2230,7 @@ KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_run); + return sprintf(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1916,6 +2253,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, */ mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { @@ -1937,6 +2275,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); +#ifdef CONFIG_NUMA +static ssize_t merge_across_nodes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_merge_across_nodes); +} + +static ssize_t merge_across_nodes_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long knob; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_merge_across_nodes != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else if (root_stable_tree == one_stable_tree) { + struct rb_root *buf; + /* + * This is the first time that we switch away from the + * default of merging across nodes: must now allocate + * a buffer to hold as many roots as may be needed. + * Allocate stable and unstable together: + * MAXSMP NODES_SHIFT 10 will use 16kB. + */ + buf = kcalloc(nr_node_ids + nr_node_ids, + sizeof(*buf), GFP_KERNEL | __GFP_ZERO); + /* Let us assume that RB_ROOT is NULL is zero */ + if (!buf) + err = -ENOMEM; + else { + root_stable_tree = buf; + root_unstable_tree = buf + nr_node_ids; + /* Stable tree is empty but not the unstable */ + root_unstable_tree[0] = one_unstable_tree[0]; + } + } + if (!err) { + ksm_merge_across_nodes = knob; + ksm_nr_node_ids = knob ? 1 : nr_node_ids; + } + } + mutex_unlock(&ksm_thread_mutex); + + return err ? err : count; +} +KSM_ATTR(merge_across_nodes); +#endif + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1991,6 +2387,9 @@ static struct attribute *ksm_attrs[] = { &pages_unshared_attr.attr, &pages_volatile_attr.attr, &full_scans_attr.attr, +#ifdef CONFIG_NUMA + &merge_across_nodes_attr.attr, +#endif NULL, }; @@ -2029,10 +2428,7 @@ static int __init ksm_init(void) #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE - /* - * Choose a high priority since the callback takes ksm_thread_mutex: - * later callbacks could only be taking locks which nest within that. 
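The KSM_RUN_OFFLINE flag above is a standard wait_on_bit()/wake_up_bit() handshake: the hotplug notifier sets the bit under ksm_thread_mutex, and everyone who needs the stable tree parks in wait_while_offlining() until MEM_OFFLINE/MEM_CANCEL_OFFLINE clears it and wakes the waiters. A stripped-down version of the handshake with made-up names, using the action-callback form of wait_on_bit() that this kernel series provides:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

#define DEMO_BUSY_BIT	2
static unsigned long demo_flags;

static int demo_just_wait(void *word)
{
	schedule();
	return 0;
}

static void demo_enter_busy(void)
{
	set_bit(DEMO_BUSY_BIT, &demo_flags);
}

static void demo_leave_busy(void)
{
	clear_bit(DEMO_BUSY_BIT, &demo_flags);
	smp_mb__after_clear_bit();	/* order the clear before waking waiters */
	wake_up_bit(&demo_flags, DEMO_BUSY_BIT);
}

static void demo_wait_until_idle(void)
{
	wait_on_bit(&demo_flags, DEMO_BUSY_BIT,
		    demo_just_wait, TASK_UNINTERRUPTIBLE);
}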
- */ + /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, 100); #endif return 0; diff --git a/mm/madvise.c b/mm/madvise.c index 03dfa5c7adb3..7055883e6e25 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -16,6 +16,9 @@ #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/swapops.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -131,6 +134,84 @@ out: return error; } +#ifdef CONFIG_SWAP +static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + pte_t *orig_pte; + struct vm_area_struct *vma = walk->private; + unsigned long index; + + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + return 0; + + for (index = start; index != end; index += PAGE_SIZE) { + pte_t pte; + swp_entry_t entry; + struct page *page; + spinlock_t *ptl; + + orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + pte = *(orig_pte + ((index - start) / PAGE_SIZE)); + pte_unmap_unlock(orig_pte, ptl); + + if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + continue; + entry = pte_to_swp_entry(pte); + if (unlikely(non_swap_entry(entry))) + continue; + + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, + vma, index); + if (page) + page_cache_release(page); + } + + return 0; +} + +static void force_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_walk walk = { + .mm = vma->vm_mm, + .pmd_entry = swapin_walk_pmd_entry, + .private = vma, + }; + + walk_page_range(start, end, &walk); + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} + +static void force_shm_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct address_space *mapping) +{ + pgoff_t index; + struct page *page; + swp_entry_t swap; + + for (; start < end; start += PAGE_SIZE) { + index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + page = find_get_page(mapping, index); + if (!radix_tree_exceptional_entry(page)) { + if (page) + page_cache_release(page); + continue; + } + swap = radix_to_swp_entry(page); + page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, + NULL, 0); + if (page) + page_cache_release(page); + } + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} +#endif /* CONFIG_SWAP */ + /* * Schedule all required I/O operations. Do not wait for completion. 
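force_swapin_readahead() above is built on the generic page-table walker: a struct mm_walk with a pmd_entry callback handed to walk_page_range(). A self-contained sketch of that callback pattern, which here only counts present ptes; every demo_* name is hypothetical:

#include <linux/mm.h>
#include <asm/pgtable.h>

struct demo_count {
	unsigned long present;
};

static int demo_pmd_entry(pmd_t *pmd, unsigned long addr,
			  unsigned long end, struct mm_walk *walk)
{
	struct demo_count *count = walk->private;
	spinlock_t *ptl;
	pte_t *pte;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE, pte++)
		if (pte_present(*pte))
			count->present++;
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}

static unsigned long demo_count_present(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	struct demo_count count = { 0 };
	struct mm_walk walk = {
		.pmd_entry	= demo_pmd_entry,
		.mm		= vma->vm_mm,
		.private	= &count,
	};

	walk_page_range(start, end, &walk);
	return count.present;
}

swapin_walk_pmd_entry() differs in that it locks, copies and unlocks one pte at a time before calling read_swap_cache_async(), since the readahead may sleep.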
*/ @@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma, { struct file *file = vma->vm_file; +#ifdef CONFIG_SWAP + if (!file || mapping_cap_swap_backed(file->f_mapping)) { + *prev = vma; + if (!file) + force_swapin_readahead(vma, start, end); + else + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#endif + if (!file) return -EBADF; @@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int error = -EINVAL; int write; size_t len; + struct blk_plug plug; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -379,27 +473,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (!madvise_behavior_valid(behavior)) return error; - write = madvise_need_mmap_write(behavior); - if (write) - down_write(¤t->mm->mmap_sem); - else - down_read(¤t->mm->mmap_sem); - if (start & ~PAGE_MASK) - goto out; + return error; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - goto out; + return error; end = start + len; if (end < start) - goto out; + return error; error = 0; if (end == start) - goto out; + return error; + + write = madvise_need_mmap_write(behavior); + if (write) + down_write(¤t->mm->mmap_sem); + else + down_read(¤t->mm->mmap_sem); /* * If the interval [start,end) covers some unmapped address @@ -410,6 +504,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (vma && start > vma->vm_start) prev = vma; + blk_start_plug(&plug); for (;;) { /* Still start < end. */ error = -ENOMEM; @@ -445,6 +540,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) vma = find_vma(current->mm, start); } out: + blk_finish_plug(&plug); if (write) up_write(¤t->mm->mmap_sem); else diff --git a/mm/memblock.c b/mm/memblock.c index 625905523c2a..a847bfe6f3ba 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -314,17 +314,19 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) } this->size += next->size; - memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); + /* move forward from next + 1, index of which is i + 2 */ + memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); type->cnt--; } } /** * memblock_insert_region - insert new memblock region - * @type: memblock type to insert into - * @idx: index for the insertion point - * @base: base address of the new region - * @size: size of the new region + * @type: memblock type to insert into + * @idx: index for the insertion point + * @base: base address of the new region + * @size: size of the new region + * @nid: node id of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. * @type must already have extra room to accomodate the new region. 
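The madvise() hunk above wraps the per-vma loop in blk_start_plug()/blk_finish_plug(), so the many small reads queued by MADV_WILLNEED readahead are handed to the block layer as one batch. The plugging pattern on its own, with a hypothetical request type and submit helper:

#include <linux/blkdev.h>

struct demo_req;				/* hypothetical request type */
static void demo_submit_one(struct demo_req *req);	/* hypothetical submit helper */

static void demo_submit_batch(struct demo_req *reqs, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* requests now collect per-task */
	for (i = 0; i < nr; i++)
		demo_submit_one(&reqs[i]);
	blk_finish_plug(&plug);		/* flush the whole batch at once */
}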
@@ -564,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %MAX_NUMNODES for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -770,6 +772,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; + if (WARN_ON(!align)) + align = __alignof__(long long); + /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); @@ -827,6 +832,23 @@ phys_addr_t __init memblock_phys_mem_size(void) return memblock.memory.total_size; } +phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) +{ + unsigned long pages = 0; + struct memblock_region *r; + unsigned long start_pfn, end_pfn; + + for_each_memblock(memory, r) { + start_pfn = memblock_region_memory_base_pfn(r); + end_pfn = memblock_region_memory_end_pfn(r); + start_pfn = min_t(unsigned long, start_pfn, limit_pfn); + end_pfn = min_t(unsigned long, end_pfn, limit_pfn); + pages += end_pfn - start_pfn; + } + + return (phys_addr_t)pages << PAGE_SHIFT; +} + /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09255ec8159c..d12ca6f3c293 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -49,6 +49,7 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <linux/vmpressure.h> #include <linux/mm_inline.h> #include <linux/page_cgroup.h> #include <linux/cpu.h> @@ -91,16 +92,18 @@ enum mem_cgroup_stat_index { /* * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ + MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, }; static const char * const mem_cgroup_stat_names[] = { "cache", "rss", + "rss_huge", "mapped_file", "swap", }; @@ -120,6 +123,14 @@ static const char * const mem_cgroup_events_names[] = { "pgmajfault", }; +static const char * const mem_cgroup_lru_names[] = { + "inactive_anon", + "active_anon", + "inactive_file", + "active_file", + "unevictable", +}; + /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremated by the number of pages. This counter is used for @@ -144,8 +155,13 @@ struct mem_cgroup_stat_cpu { }; struct mem_cgroup_reclaim_iter { - /* css_id of the last scanned hierarchy member */ - int position; + /* + * last scanned hierarchy member. Valid only if last_dead_count + * matches memcg->dead_count of the hierarchy root group. 
+ */ + struct mem_cgroup *last_visited; + unsigned long last_dead_count; + /* scan generation, increased every round-trip */ unsigned int generation; }; @@ -171,10 +187,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -struct mem_cgroup_lru_info { - struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -248,45 +260,19 @@ struct mem_cgroup { */ struct res_counter res; - union { - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; + /* vmpressure notifications */ + struct vmpressure vmpressure; - /* - * rcu_freeing is used only when freeing struct mem_cgroup, - * so put it into a union to avoid wasting more memory. - * It must be disjoint from the css field. It could be - * in a union with the res field, but res plays a much - * larger part in mem_cgroup life than memsw, and might - * be of interest, even at time of free, when debugging. - * So share rcu_head with the less interesting memsw. - */ - struct rcu_head rcu_freeing; - /* - * We also need some space for a worker in deferred freeing. - * By the time we call it, rcu_freeing is no longer in use. - */ - struct work_struct work_freeing; - }; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* * the counter to account for kernel memory usage. */ struct res_counter kmem; /* - * Per cgroup active and inactive list, similar to the - * per zone LRU lists. - */ - struct mem_cgroup_lru_info info; - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; @@ -295,8 +281,6 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; - atomic_t refcnt; - int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -338,6 +322,7 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; + atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif @@ -349,8 +334,24 @@ struct mem_cgroup { /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif + + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + atomic_t numainfo_events; + atomic_t numainfo_updating; +#endif + + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ }; +static size_t memcg_size(void) +{ + return sizeof(struct mem_cgroup) + + nr_node_ids * sizeof(struct mem_cgroup_per_node); +} + /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ @@ -385,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { + /* + * Our caller must use css_get() first, because memcg_uncharge_kmem() + * will call css_put() if it sees the memcg is dead. + */ + smp_wmb(); if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); } @@ -398,8 +404,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) /* Stuffs for move charges at task migration. */ /* - * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a - * left-shifted bitmap of these types. 
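Moving the per-node info into a trailing nodeinfo[] array (sized via memcg_size()) is the classic trailing-array trick: a single allocation carries the fixed header plus one slot per possible node, and the array member must stay last. The shape of the pattern, with made-up names and sizes consistent with this demo rather than with struct mem_cgroup itself:

#include <linux/slab.h>

struct demo_pernode {
	unsigned long stat;
};

struct demo_group {
	int id;
	/* ... fixed-size members ... */
	struct demo_pernode *nodeinfo[0];	/* must remain the last member */
};

static size_t demo_group_size(int nr_nodes)
{
	return sizeof(struct demo_group) +
	       nr_nodes * sizeof(struct demo_pernode *);
}

static struct demo_group *demo_group_alloc(int nr_nodes)
{
	/* header and the nr_nodes pointer slots come from one allocation */
	return kzalloc(demo_group_size(nr_nodes), GFP_KERNEL);
}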
+ * Types of charges to be moved. "move_charge_at_immitgrate" and + * "immigrate_flags" are treated as a left-shifted bitmap of these types. */ enum move_type { MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ @@ -412,6 +418,7 @@ static struct move_charge_struct { spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; + unsigned long immigrate_flags; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; @@ -424,14 +431,12 @@ static struct move_charge_struct { static bool move_anon(void) { - return test_bit(MOVE_CHARGE_TYPE_ANON, - &mc.to->move_charge_at_immigrate); + return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); } static bool move_file(void) { - return test_bit(MOVE_CHARGE_TYPE_FILE, - &mc.to->move_charge_at_immigrate); + return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); } /* @@ -471,8 +476,12 @@ enum res_type { #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -static void mem_cgroup_get(struct mem_cgroup *memcg); -static void mem_cgroup_put(struct mem_cgroup *memcg); +/* + * The memcg_create_mutex will be held whenever a new cgroup is created. + * As a consequence, any change that needs to protect against new child cgroups + * appearing has to hold it as well. + */ +static DEFINE_MUTEX(memcg_create_mutex); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) @@ -480,6 +489,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) return container_of(s, struct mem_cgroup, css); } +/* Some nice accessors for the vmpressure. */ +struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) +{ + if (!memcg) + memcg = root_mem_cgroup; + return &memcg->vmpressure; +} + +struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +{ + return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; +} + +struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) +{ + return &mem_cgroup_from_css(css)->vmpressure; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -506,15 +533,15 @@ void sock_update_memcg(struct sock *sk) */ if (sk->sk_cgrp) { BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - mem_cgroup_get(sk->sk_cgrp->memcg); + css_get(&sk->sk_cgrp->memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { - mem_cgroup_get(memcg); + if (!mem_cgroup_is_root(memcg) && + memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -528,7 +555,7 @@ void sock_release_memcg(struct sock *sk) struct mem_cgroup *memcg; WARN_ON(!sk->sk_cgrp->memcg); memcg = sk->sk_cgrp->memcg; - mem_cgroup_put(memcg); + css_put(&sk->sk_cgrp->memcg->css); } } @@ -627,7 +654,8 @@ static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { - return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; + VM_BUG_ON((unsigned)nid >= nr_node_ids); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) @@ -863,6 +891,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, + struct page *page, bool anon, int nr_pages) { 
preempt_disable(); @@ -878,6 +907,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + if (PageTransHuge(page)) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + nr_pages); + /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); @@ -1042,6 +1075,103 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +/* + * Returns a next (in a pre-order walk) alive memcg (with elevated css + * ref. count) or NULL if the whole root's subtree has been visited. + * + * helper function to be used by mem_cgroup_iter + */ +static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, + struct mem_cgroup *last_visited) +{ + struct cgroup *prev_cgroup, *next_cgroup; + + /* + * Root is not visited by cgroup iterators so it needs an + * explicit visit. + */ + if (!last_visited) + return root; + + prev_cgroup = (last_visited == root) ? NULL + : last_visited->css.cgroup; +skip_node: + next_cgroup = cgroup_next_descendant_pre( + prev_cgroup, root->css.cgroup); + + /* + * Even if we found a group we have to make sure it is + * alive. css && !memcg means that the groups should be + * skipped and we should continue the tree walk. + * last_visited css is safe to use because it is + * protected by css_get and the tree walk is rcu safe. + */ + if (next_cgroup) { + struct mem_cgroup *mem = mem_cgroup_from_cont( + next_cgroup); + if (css_tryget(&mem->css)) + return mem; + else { + prev_cgroup = next_cgroup; + goto skip_node; + } + } + + return NULL; +} + +static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) +{ + /* + * When a group in the hierarchy below root is destroyed, the + * hierarchy iterator can no longer be trusted since it might + * have pointed to the destroyed group. Invalidate it. + */ + atomic_inc(&root->dead_count); +} + +static struct mem_cgroup * +mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *root, + int *sequence) +{ + struct mem_cgroup *position = NULL; + /* + * A cgroup destruction happens in two stages: offlining and + * release. They are separated by a RCU grace period. + * + * If the iterator is valid, we may still race with an + * offlining. The RCU lock ensures the object won't be + * released, tryget will fail if we lost the race. + */ + *sequence = atomic_read(&root->dead_count); + if (iter->last_dead_count == *sequence) { + smp_rmb(); + position = iter->last_visited; + if (position && !css_tryget(&position->css)) + position = NULL; + } + return position; +} + +static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *last_visited, + struct mem_cgroup *new_position, + int sequence) +{ + if (last_visited) + css_put(&last_visited->css); + /* + * We store the sequence count from the time @last_visited was + * loaded successfully instead of rereading it here so that we + * don't lose destruction events in between. We could have + * raced with the destruction of @new_position after all. 
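mem_cgroup_iter_load()/mem_cgroup_iter_update() above cache the last visited memcg and validate it against the root's dead_count, so a destroyed child is never dereferenced through a stale iterator. The caching scheme in isolation, with hypothetical demo_* names and a tryget stand-in for css_tryget():

#include <linux/types.h>
#include <linux/atomic.h>
#include <asm/barrier.h>

struct demo_obj;			/* hypothetical refcounted object */

static bool demo_tryget(struct demo_obj *obj)
{
	return obj != NULL;		/* stand-in: the real code uses css_tryget() */
}

struct demo_iter {
	struct demo_obj *last;		/* may point to a destroyed object */
	int last_seq;			/* demo_root_seq value when "last" was stored */
};

static atomic_t demo_root_seq = ATOMIC_INIT(0);

/* called whenever an object in the walked set is destroyed */
static void demo_invalidate(void)
{
	atomic_inc(&demo_root_seq);
}

static struct demo_obj *demo_iter_load(struct demo_iter *iter, int *seq)
{
	struct demo_obj *pos = NULL;

	*seq = atomic_read(&demo_root_seq);
	if (iter->last_seq == *seq) {
		smp_rmb();	/* pairs with smp_wmb() in demo_iter_update() */
		pos = iter->last;
		if (pos && !demo_tryget(pos))
			pos = NULL;	/* lost the race with destruction */
	}
	return pos;
}

static void demo_iter_update(struct demo_iter *iter, struct demo_obj *new, int seq)
{
	iter->last = new;
	smp_wmb();	/* publish "last" before the sequence that validates it */
	iter->last_seq = seq;
}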
+ */ + iter->last_visited = new_position; + smp_wmb(); + iter->last_dead_count = sequence; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1064,7 +1194,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; - int id = 0; + struct mem_cgroup *last_visited = NULL; if (mem_cgroup_disabled()) return NULL; @@ -1073,20 +1203,18 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev && !reclaim) - id = css_id(&prev->css); - - if (prev && prev != root) - css_put(&prev->css); + last_visited = prev; if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) - return NULL; + goto out_css_put; return root; } + rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - struct cgroup_subsys_state *css; + int uninitialized_var(seq); if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1095,31 +1223,34 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - if (prev && reclaim->generation != iter->generation) - return NULL; - id = iter->position; + if (prev && reclaim->generation != iter->generation) { + iter->last_visited = NULL; + goto out_unlock; + } + + last_visited = mem_cgroup_iter_load(iter, root, &seq); } - rcu_read_lock(); - css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); - if (css) { - if (css == &root->css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { - iter->position = id; - if (!css) + mem_cgroup_iter_update(iter, last_visited, memcg, seq); + + if (!memcg) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; } - if (prev && !css) - return NULL; + if (prev && !memcg) + goto out_unlock; } +out_unlock: + rcu_read_unlock(); +out_css_put: + if (prev && prev != root) + css_put(&prev->css); + return memcg; } @@ -1317,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return ret; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - int ret; struct mem_cgroup *curr = NULL; struct task_struct *p; + bool ret; p = find_lock_task_mm(task); if (p) { @@ -1333,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) * killer still needs to detect if they have already been oom * killed to prevent needlessly killing additional tasks. */ - task_lock(task); + rcu_read_lock(); curr = mem_cgroup_from_task(task); if (curr) css_get(&curr->css); - task_unlock(task); + rcu_read_unlock(); } if (!curr) - return 0; + return false; /* * We should check use_hierarchy of "memcg" not "curr". 
Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -1371,17 +1503,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } -int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) -{ - unsigned long active; - unsigned long inactive; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); - - return (active > inactive); -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1524,8 +1645,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, spin_unlock_irqrestore(&memcg->move_lock, *flags); } +#define K(x) ((x) << (PAGE_SHIFT-10)) /** - * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. + * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. * @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * @@ -1543,8 +1665,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) */ static char memcg_name[PATH_MAX]; int ret; + struct mem_cgroup *iter; + unsigned int i; - if (!memcg || !p) + if (!p) return; rcu_read_lock(); @@ -1563,7 +1687,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) } rcu_read_unlock(); - printk(KERN_INFO "Task in %s killed", memcg_name); + pr_info("Task in %s killed", memcg_name); rcu_read_lock(); ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); @@ -1576,22 +1700,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Continues from above, so we don't need an KERN_ level */ - printk(KERN_CONT " as a result of limit of %s\n", memcg_name); + pr_cont(" as a result of limit of %s\n", memcg_name); done: - printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->res, RES_FAILCNT)); - printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " - "failcnt %llu\n", + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); - printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); + + for_each_mem_cgroup_tree(iter, memcg) { + pr_info("Memory cgroup stats"); + + rcu_read_lock(); + ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); + if (!ret) + pr_cont(" for %s", memcg_name); + rcu_read_unlock(); + pr_cont(":"); + + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + continue; + pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], + K(mem_cgroup_read_stat(iter, i))); + } + + for (i = 0; i < NR_LRU_LISTS; i++) + pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], + K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); + + pr_cont("\n"); + } } /* @@ -1646,11 +1793,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct task_struct *chosen = NULL; /* - * If current has a pending SIGKILL, then 
automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. */ - if (fatal_signal_pending(current)) { + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { set_thread_flag(TIF_MEMDIE); return; } @@ -2256,6 +2403,17 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } +static void __init memcg_stock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct memcg_stock_pcp *stock = + &per_cpu(memcg_stock, cpu); + INIT_WORK(&stock->work, drain_local_stock); + } +} + /* * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. @@ -2762,7 +2920,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, else anon = false; - mem_cgroup_charge_statistics(memcg, anon, nr_pages); + mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); unlock_page_cgroup(pc); /* @@ -2874,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) if (res_counter_uncharge(&memcg->kmem, size)) return; + /* + * Releases a reference taken in kmem_cgroup_css_offline in case + * this last uncharge is racing with the offlining code or it is + * outliving the memcg existence. + * + * The memory barrier imposed by test&clear is paired with the + * explicit one in memcg_kmem_mark_dead(). + */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -2961,6 +3127,8 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } +static void kmem_cache_destroy_work_func(struct work_struct *w); + int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3027,10 +3195,14 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, if (!s->memcg_params) return -ENOMEM; + INIT_WORK(&s->memcg_params->destroy, + kmem_cache_destroy_work_func); if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - } + } else + s->memcg_params->is_root_cache = true; + return 0; } @@ -3055,12 +3227,12 @@ void memcg_release_cache(struct kmem_cache *s) root = s->memcg_params->root_cache; root->memcg_params->memcg_caches[id] = NULL; - mem_cgroup_put(memcg); mutex_lock(&memcg->slab_caches_mutex); list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); + css_put(&memcg->css); out: kfree(s->memcg_params); } @@ -3161,52 +3333,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) -{ - char *name; - struct dentry *dentry; - - rcu_read_lock(); - dentry = rcu_dereference(memcg->css.cgroup->dentry); - rcu_read_unlock(); - - BUG_ON(dentry == NULL); - - name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), dentry->d_name.name); - - return name; -} +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. 
+ * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. + */ +static DEFINE_MUTEX(memcg_cache_mutex); +/* + * Called with memcg_cache_mutex held + */ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, struct kmem_cache *s) { - char *name; struct kmem_cache *new; + static char *tmp_name = NULL; - name = memcg_cache_name(memcg, s); - if (!name) - return NULL; + lockdep_assert_held(&memcg_cache_mutex); + + /* + * kmem_cache_create_memcg duplicates the given name and + * cgroup_name for this name requires RCU context. + * This static temporary buffer is used to prevent from + * pointless shortliving allocation. + */ + if (!tmp_name) { + tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!tmp_name) + return NULL; + } - new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + rcu_read_lock(); + snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); + rcu_read_unlock(); + + new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; - kfree(name); return new; } -/* - * This lock protects updaters, not readers. We want readers to be as fast as - * they can, and they will either see NULL or a valid cache value. Our model - * allow them to see NULL, in which case the root memcg will be selected. - * - * We need this lock because multiple allocations to the same cache from a non - * will span more than one worker. Only one of them can create the cache. - */ -static DEFINE_MUTEX(memcg_cache_mutex); static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *cachep) { @@ -3219,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&memcg_cache_mutex); new_cachep = cachep->memcg_params->memcg_caches[idx]; - if (new_cachep) + if (new_cachep) { + css_put(&memcg->css); goto out; + } new_cachep = kmem_cache_dup(memcg, cachep); if (new_cachep == NULL) { new_cachep = cachep; + css_put(&memcg->css); goto out; } - mem_cgroup_get(memcg); atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; @@ -3305,8 +3480,6 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) list_for_each_entry(params, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); cachep->memcg_params->dead = true; - INIT_WORK(&cachep->memcg_params->destroy, - kmem_cache_destroy_work_func); schedule_work(&cachep->memcg_params->destroy); } mutex_unlock(&memcg->slab_caches_mutex); @@ -3318,14 +3491,11 @@ static void memcg_create_cache_work_func(struct work_struct *w) cw = container_of(w, struct create_work, work); memcg_create_kmem_cache(cw->memcg, cw->cachep); - /* Drop the reference gotten when we enqueued. */ - css_put(&cw->memcg->css); kfree(cw); } /* * Enqueue the creation of a per-memcg kmem_cache. - * Called with rcu_read_lock. */ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -3333,12 +3503,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, struct create_work *cw; cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); - if (cw == NULL) - return; - - /* The corresponding put will be done in the workqueue. 
*/ - if (!css_tryget(&memcg->css)) { - kfree(cw); + if (cw == NULL) { + css_put(&memcg->css); return; } @@ -3394,10 +3560,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); - rcu_read_unlock(); if (!memcg_can_account_kmem(memcg)) - return cachep; + goto out; idx = memcg_cache_id(memcg); @@ -3406,29 +3571,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { - /* - * If we are in a safe context (can wait, and not in interrupt - * context), we could be be predictable and return right away. - * This would guarantee that the allocation being performed - * already belongs in the new cache. - * - * However, there are some clashes that can arrive from locking. - * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. - */ - memcg_create_cache_enqueue(memcg, cachep); - return cachep; + if (likely(cachep->memcg_params->memcg_caches[idx])) { + cachep = cachep->memcg_params->memcg_caches[idx]; + goto out; } - return cachep->memcg_params->memcg_caches[idx]; + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) + goto out; + rcu_read_unlock(); + + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; +out: + rcu_read_unlock(); + return cachep; } EXPORT_SYMBOL(__memcg_kmem_get_cache); @@ -3453,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) int ret; *_memcg = NULL; + + /* + * Disabling accounting is only relevant for some specific memcg + * internal allocations. Therefore we would initially not have such + * check here, since direct calls to the page allocator that are marked + * with GFP_KMEMCG only happen outside memcg core. We are mostly + * concerned with cache allocations, and by having this test at + * memcg_kmem_get_cache, we are already able to relay the allocation to + * the root cache and bypass the memcg cache altogether. + * + * There is one exception, though: the SLUB allocator does not create + * large order caches, but rather service large kmallocs directly from + * the page allocator. 
Therefore, the following sequence when backed by + * the SLUB allocator: + * + * memcg_stop_kmem_account(); + * kmalloc(<large_number>) + * memcg_resume_kmem_account(); + * + * would effectively ignore the fact that we should skip accounting, + * since it will drive us directly to this function without passing + * through the cache selector memcg_kmem_get_cache. Such large + * allocations are extremely rare but can happen, for instance, for the + * cache arrays. We bring this test here. + */ + if (!current->mm || current->memcg_kmem_skip_account) + return true; + memcg = try_get_mem_cgroup_from_mm(current->mm); /* @@ -3546,16 +3748,21 @@ void mem_cgroup_split_huge_fixup(struct page *head) { struct page_cgroup *head_pc = lookup_page_cgroup(head); struct page_cgroup *pc; + struct mem_cgroup *memcg; int i; if (mem_cgroup_disabled()) return; + + memcg = head_pc->mem_cgroup; for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; - pc->mem_cgroup = head_pc->mem_cgroup; + pc->mem_cgroup = memcg; smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3611,11 +3818,11 @@ static int mem_cgroup_move_account(struct page *page, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, anon, -nr_pages); + mem_cgroup_charge_statistics(from, page, anon, -nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, anon, nr_pages); + mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: @@ -3934,8 +4141,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, if (mem_cgroup_disabled()) return NULL; - VM_BUG_ON(PageSwapCache(page)); - if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); @@ -3990,7 +4195,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, break; } - mem_cgroup_charge_statistics(memcg, anon, -nr_pages); + mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); ClearPageCgroupUsed(pc); /* @@ -4003,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, unlock_page_cgroup(pc); /* * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed. + * will never be freed, so it's safe to call css_get(). */ memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { mem_cgroup_swap_statistics(memcg, true); - mem_cgroup_get(memcg); + css_get(&memcg->css); } /* * Migration does not charge the res_counter for the @@ -4031,6 +4236,18 @@ void mem_cgroup_uncharge_page(struct page *page) if (page_mapped(page)) return; VM_BUG_ON(page->mapping && !PageAnon(page)); + /* + * If the page is in swap cache, uncharge should be deferred + * to the swap path, which also properly accounts swap usage + * and handles memcg lifetime. + * + * Note that this check is not stable and reclaim may add the + * page to swap cache at any time after this. However, if the + * page is not in swap cache by the time page->mapcount hits + * 0, there won't be any page table references to the swap + * slot, and reclaim will free it and not actually write the + * page to disk. 
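+ *
+ * For illustration, the deferral itself is just the check that follows:
+ *
+ *	if (PageSwapCache(page))
+ *		return;
+ *
+ * the actual uncharge is then performed from the swap-cache teardown
+ * path (mem_cgroup_uncharge_swapcache(), visible further down in this
+ * diff) once the page is removed from the swap cache.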
+ */ if (PageSwapCache(page)) return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); @@ -4108,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) /* * record memcg information, if swapout && memcg != NULL, - * mem_cgroup_get() was called in uncharge(). + * css_get() was called in uncharge(). */ if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); @@ -4139,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + css_put(&memcg->css); } rcu_read_unlock(); } @@ -4173,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * This function is only called from task migration context now. * It postpones res_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone mem_cgroup_get(to) - * because if the process that has been moved to @to does - * swap-in, the refcount of @to might be decreased to 0. + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). */ - mem_cgroup_get(to); + css_get(&to->css); return 0; } return -EINVAL; @@ -4340,7 +4560,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, false, -1); + mem_cgroup_charge_statistics(memcg, oldpage, false, -1); ClearPageCgroupUsed(pc); } unlock_page_cgroup(pc); @@ -4389,8 +4609,8 @@ void mem_cgroup_print_bad_page(struct page *page) pc = lookup_page_cgroup_used(page); if (pc) { - printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", - pc, pc->flags, pc->mem_cgroup); + pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", + pc, pc->flags, pc->mem_cgroup); } } #endif @@ -4717,6 +4937,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) } /* + * This mainly exists for tests during the setting of set of use_hierarchy. + * Since this is the very setting we are changing, the current hierarchy value + * is meaningless + */ +static inline bool __memcg_has_children(struct mem_cgroup *memcg) +{ + struct cgroup *pos; + + /* bounce at first found */ + cgroup_for_each_child(pos, memcg->css.cgroup) + return true; + return false; +} + +/* + * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed + * to be already dead (as in mem_cgroup_force_empty, for instance). This is + * from mem_cgroup_count_children(), in the sense that we don't really care how + * many children we have; we only need to know if we have any. It also counts + * any memcg without hierarchy as infertile. + */ +static inline bool memcg_has_children(struct mem_cgroup *memcg) +{ + return memcg->use_hierarchy && __memcg_has_children(memcg); +} + +/* * Reclaims as many pages from the given memcg as possible and moves * the rest to the parent. 
* @@ -4786,7 +5033,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, if (parent) parent_memcg = mem_cgroup_from_cont(parent); - cgroup_lock(); + mutex_lock(&memcg_create_mutex); if (memcg->use_hierarchy == val) goto out; @@ -4801,7 +5048,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (list_empty(&cont->children)) + if (!__memcg_has_children(memcg)) memcg->use_hierarchy = val; else retval = -EBUSY; @@ -4809,7 +5056,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, retval = -EINVAL; out: - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return retval; } @@ -4841,6 +5088,10 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return res_counter_read_u64(&memcg->memsw, RES_USAGE); } + /* + * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS + * as well as in MEM_CGROUP_STAT_RSS_HUGE. + */ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); @@ -4863,9 +5114,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (type) { case _MEM: if (name == RES_USAGE) @@ -4894,8 +5142,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM - bool must_inc_static_branch = false; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -4908,18 +5154,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) * * After it first became limited, changes in the value of the limit are * of course permitted. - * - * Taking the cgroup_lock is really offensive, but it is so far the only - * way to guarantee that no children will appear. There are plenty of - * other offenders, and they should all go away. Fine grained locking - * is probably the way to go here. When we are fully hierarchical, we - * can also get rid of the use_hierarchy check. */ - cgroup_lock(); + mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { - if (cgroup_task_count(cont) || (memcg->use_hierarchy && - !list_empty(&cont->children))) { + if (cgroup_task_count(cont) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; } @@ -4931,44 +5170,22 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); goto out; } - must_inc_static_branch = true; - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes, so it is unfeasible to migrate them away. We - * need to reference count the memcg because of that. - */ - mem_cgroup_get(memcg); - } else - ret = res_counter_set_limit(&memcg->kmem, val); -out: - mutex_unlock(&set_limit_mutex); - cgroup_unlock(); - - /* - * We are by now familiar with the fact that we can't inc the static - * branch inside cgroup_lock. See disarm functions for details. A - * worker here is overkill, but also wrong: After the limit is set, we - * must start accounting right away. Since this operation can't fail, - * we can safely defer it to here - no rollback will be needed. 
- * - * The boolean used to control this is also safe, because - * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be - * able to set it to true; - */ - if (must_inc_static_branch) { static_key_slow_inc(&memcg_kmem_enabled_key); /* * setting the active bit after the inc will guarantee no one * starts accounting before all call sites are patched */ memcg_kmem_set_active(memcg); - } - + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + mutex_unlock(&memcg_create_mutex); #endif return ret; } +#ifdef CONFIG_MEMCG_KMEM static int memcg_propagate_kmem(struct mem_cgroup *memcg) { int ret = 0; @@ -4977,7 +5194,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) goto out; memcg->kmem_account_flags = parent->kmem_account_flags; -#ifdef CONFIG_MEMCG_KMEM /* * When that happen, we need to disable the static branch only on those * memcgs that enabled it. To achieve this, we would be forced to @@ -4992,21 +5208,21 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) goto out; /* - * destroy(), called if we fail, will issue static_key_slow_inc() and - * mem_cgroup_put() if kmem is enabled. We have to either call them - * unconditionally, or clear the KMEM_ACTIVE flag. I personally find - * this more consistent, since it always leads to the same destroy path + * __mem_cgroup_free() will issue static_key_slow_dec() because this + * memcg is active already. If the later initialization fails then the + * cgroup core triggers the cleanup so we do not have to do it here. */ - mem_cgroup_get(memcg); static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); + memcg_stop_kmem_account(); ret = memcg_update_cache_sizes(memcg); + memcg_resume_kmem_account(); mutex_unlock(&set_limit_mutex); -#endif out: return ret; } +#endif /* CONFIG_MEMCG_KMEM */ /* * The user of this function is... @@ -5024,9 +5240,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (name) { case RES_LIMIT: if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ @@ -5103,9 +5316,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (name) { case RES_MAX_USAGE: if (type == _MEM) @@ -5146,15 +5356,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; + /* - * We check this value several times in both in can_attach() and - * attach(), so we need cgroup lock to prevent this value from being - * inconsistent. + * No kind of locking is needed in here, because ->can_attach() will + * check this value once in the beginning of the process, and then carry + * on with stale data. This means that changes to this value will only + * affect task migrations starting after the change. 
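+ *
+ * For illustration, mem_cgroup_can_attach() (updated later in this diff)
+ * takes a single snapshot and sticks with it:
+ *
+ *	move_charge_at_immigrate = memcg->move_charge_at_immigrate;
+ *	if (move_charge_at_immigrate) {
+ *		...
+ *		mc.immigrate_flags = move_charge_at_immigrate;
+ *	}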
*/ - cgroup_lock(); memcg->move_charge_at_immigrate = val; - cgroup_unlock(); - return 0; } #else @@ -5212,14 +5421,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, } #endif /* CONFIG_NUMA */ -static const char * const mem_cgroup_lru_names[] = { - "inactive_anon", - "active_anon", - "inactive_file", - "active_file", - "unevictable", -}; - static inline void mem_cgroup_lru_names_not_uptodate(void) { BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -5333,18 +5534,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, parent = mem_cgroup_from_cont(cgrp->parent); - cgroup_lock(); + mutex_lock(&memcg_create_mutex); /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) { - cgroup_unlock(); + if ((parent->use_hierarchy) || memcg_has_children(memcg)) { + mutex_unlock(&memcg_create_mutex); return -EINVAL; } memcg->swappiness = val; - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return 0; } @@ -5670,17 +5870,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, parent = mem_cgroup_from_cont(cgrp->parent); - cgroup_lock(); + mutex_lock(&memcg_create_mutex); /* oom-kill-disable is a flag for subhierarchy. */ - if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) { - cgroup_unlock(); + if ((parent->use_hierarchy) || memcg_has_children(memcg)) { + mutex_unlock(&memcg_create_mutex); return -EINVAL; } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return 0; } @@ -5695,25 +5894,45 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return ret; return mem_cgroup_sockets_init(memcg, ss); -}; +} -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) +{ + if (!memcg_kmem_is_active(memcg)) + return; + + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes. As we prevent from taking a reference for every + * such allocation we have to be careful when doing uncharge + * (see memcg_uncharge_kmem) and here during offlining. + * + * The idea is that that only the _last_ uncharge which sees + * the dead memcg will drop the last reference. An additional + * reference is taken here before the group is marked dead + * which is then paired with css_put during uncharge resp. here. + * + * Although this might sound strange as this path is called from + * css_offline() when the referencemight have dropped down to 0 + * and shouldn't be incremented anymore (css_tryget would fail) + * we do not have other options because of the kmem allocations + * lifetime. 
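+ *
+ * A minimal sketch of the pairing (illustration only):
+ *
+ *	css_get(&memcg->css);		extra reference taken here
+ *	memcg_kmem_mark_dead(memcg);
+ *	...
+ *	if (memcg_kmem_test_and_clear_dead(memcg))
+ *		css_put(&memcg->css);	dropped here or in memcg_uncharge_kmem()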
+ */ + css_get(&memcg->css); memcg_kmem_mark_dead(memcg); if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) return; - /* - * Charges already down to 0, undo mem_cgroup_get() done in the charge - * path here, being careful not to race with memcg_uncharge_kmem: it is - * possible that the charges went down to 0 between mark_dead and the - * res_counter read, so in that case, we don't need the put - */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -5721,7 +5940,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) { } #endif @@ -5768,6 +5991,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "use_hierarchy", + .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, @@ -5789,39 +6013,17 @@ static struct cftype mem_cgroup_files[] = { .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, + { + .name = "pressure_level", + .register_event = vmpressure_register_event, + .unregister_event = vmpressure_unregister_event, + }, #ifdef CONFIG_NUMA { .name = "numa_stat", .read_seq_string = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_SWAP - { - .name = "memsw.usage_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, - }, - { - .name = "memsw.max_usage_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, - }, - { - .name = "memsw.limit_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write_string = mem_cgroup_write, - .read = mem_cgroup_read, - }, - { - .name = "memsw.failcnt", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, - }, -#endif #ifdef CONFIG_MEMCG_KMEM { .name = "kmem.limit_in_bytes", @@ -5856,6 +6058,36 @@ static struct cftype mem_cgroup_files[] = { { }, /* terminate */ }; +#ifdef CONFIG_MEMCG_SWAP +static struct cftype memsw_cgroup_files[] = { + { + .name = "memsw.usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), + .read = mem_cgroup_read, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, + }, + { + .name = "memsw.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "memsw.limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "memsw.failcnt", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { }, /* terminate */ +}; +#endif static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -5882,21 +6114,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) mz->on_tree = false; mz->memcg = memcg; } - memcg->info.nodeinfo[node] = pn; + memcg->nodeinfo[node] = pn; return 0; } static void 
free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->info.nodeinfo[node]); + kfree(memcg->nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - int size = sizeof(struct mem_cgroup); + size_t size = memcg_size(); - /* Can be very big if MAX_NUMNODES is very big */ + /* Can be very big if nr_node_ids is very big */ if (size < PAGE_SIZE) memcg = kzalloc(size, GFP_KERNEL); else @@ -5933,7 +6165,7 @@ out_free: static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - int size = sizeof(struct mem_cgroup); + size_t size = memcg_size(); mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); @@ -5961,49 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) vfree(memcg); } - -/* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. - */ -static void free_work(struct work_struct *work) -{ - struct mem_cgroup *memcg; - - memcg = container_of(work, struct mem_cgroup, work_freeing); - __mem_cgroup_free(memcg); -} - -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} - -static void mem_cgroup_get(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->refcnt); -} - -static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) -{ - if (atomic_sub_and_test(count, &memcg->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - call_rcu(&memcg->rcu_freeing, free_rcu); - if (parent) - mem_cgroup_put(parent); - } -} - -static void mem_cgroup_put(struct mem_cgroup *memcg) -{ - __mem_cgroup_put(memcg, 1); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -6015,19 +6204,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); -#ifdef CONFIG_MEMCG_SWAP -static void __init enable_swap_cgroup(void) -{ - if (!mem_cgroup_disabled() && really_do_swap_account) - do_swap_account = 1; -} -#else -static void __init enable_swap_cgroup(void) -{ -} -#endif - -static int mem_cgroup_soft_limit_tree_init(void) +static void __init mem_cgroup_soft_limit_tree_init(void) { struct mem_cgroup_tree_per_node *rtpn; struct mem_cgroup_tree_per_zone *rtpz; @@ -6038,8 +6215,7 @@ static int mem_cgroup_soft_limit_tree_init(void) if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); - if (!rtpn) - goto err_cleanup; + BUG_ON(!rtpn); soft_limit_tree.rb_tree_per_node[node] = rtpn; @@ -6049,23 +6225,12 @@ static int mem_cgroup_soft_limit_tree_init(void) spin_lock_init(&rtpz->lock); } } - return 0; - -err_cleanup: - for_each_node(node) { - if (!soft_limit_tree.rb_tree_per_node[node]) - break; - kfree(soft_limit_tree.rb_tree_per_node[node]); - soft_limit_tree.rb_tree_per_node[node] = NULL; - } - return 1; - } static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup *cont) { - struct mem_cgroup *memcg, *parent; + struct mem_cgroup *memcg; long error = -ENOMEM; int node; @@ -6079,35 +6244,52 @@ mem_cgroup_css_alloc(struct cgroup *cont) /* root ? 
*/ if (cont->parent == NULL) { - int cpu; - enable_swap_cgroup(); - parent = NULL; - if (mem_cgroup_soft_limit_tree_init()) - goto free_out; root_mem_cgroup = memcg; - for_each_possible_cpu(cpu) { - struct memcg_stock_pcp *stock = - &per_cpu(memcg_stock, cpu); - INIT_WORK(&stock->work, drain_local_stock); - } - } else { - parent = mem_cgroup_from_cont(cont->parent); - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; + res_counter_init(&memcg->res, NULL); + res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); } - if (parent && parent->use_hierarchy) { + memcg->last_scanned_node = MAX_NUMNODES; + INIT_LIST_HEAD(&memcg->oom_notify); + memcg->move_charge_at_immigrate = 0; + mutex_init(&memcg->thresholds_lock); + spin_lock_init(&memcg->move_lock); + vmpressure_init(&memcg->vmpressure); + + return &memcg->css; + +free_out: + __mem_cgroup_free(memcg); + return ERR_PTR(error); +} + +static int +mem_cgroup_css_online(struct cgroup *cont) +{ + struct mem_cgroup *memcg, *parent; + int error = 0; + + if (!cont->parent) + return 0; + + mutex_lock(&memcg_create_mutex); + memcg = mem_cgroup_from_cont(cont); + parent = mem_cgroup_from_cont(cont->parent); + + memcg->use_hierarchy = parent->use_hierarchy; + memcg->oom_kill_disable = parent->oom_kill_disable; + memcg->swappiness = mem_cgroup_swappiness(parent); + + if (parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); res_counter_init(&memcg->kmem, &parent->kmem); /* - * We increment refcnt of the parent to ensure that we can - * safely access it on res_counter_charge/uncharge. - * This refcnt will be decremented when freeing this - * mem_cgroup(see mem_cgroup_put). + * No need to take a reference to the parent because cgroup + * core guarantees its existence. */ - mem_cgroup_get(parent); } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); @@ -6117,39 +6299,40 @@ mem_cgroup_css_alloc(struct cgroup *cont) * much sense so let cgroup subsystem know about this * unfortunate state in our controller. */ - if (parent && parent != root_mem_cgroup) + if (parent != root_mem_cgroup) mem_cgroup_subsys.broken_hierarchy = true; } - memcg->last_scanned_node = MAX_NUMNODES; - INIT_LIST_HEAD(&memcg->oom_notify); - - if (parent) - memcg->swappiness = mem_cgroup_swappiness(parent); - atomic_set(&memcg->refcnt, 1); - memcg->move_charge_at_immigrate = 0; - mutex_init(&memcg->thresholds_lock); - spin_lock_init(&memcg->move_lock); error = memcg_init_kmem(memcg, &mem_cgroup_subsys); - if (error) { - /* - * We call put now because our (and parent's) refcnts - * are already in place. mem_cgroup_put() will internally - * call __mem_cgroup_free, so return directly - */ - mem_cgroup_put(memcg); - return ERR_PTR(error); - } - return &memcg->css; -free_out: - __mem_cgroup_free(memcg); - return ERR_PTR(error); + mutex_unlock(&memcg_create_mutex); + return error; +} + +/* + * Announce all parents that a group from their hierarchy is gone. + */ +static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = memcg; + + while ((parent = parent_mem_cgroup(parent))) + mem_cgroup_iter_invalidate(parent); + + /* + * if the root memcg is not hierarchical we have to check it + * explicitely. 
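+ *
+ * (Roughly: when the root is not hierarchical, its children are
+ * initialized with res_counter_init(&memcg->res, NULL) in
+ * mem_cgroup_css_online(), so the parent_mem_cgroup() walk above never
+ * reaches root_mem_cgroup; hence the explicit check below.)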
+ */ + if (!root_mem_cgroup->use_hierarchy) + mem_cgroup_iter_invalidate(root_mem_cgroup); } static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_css_offline(memcg); + + mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); } @@ -6158,9 +6341,8 @@ static void mem_cgroup_css_free(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(memcg); - - mem_cgroup_put(memcg); + memcg_destroy_kmem(memcg); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU @@ -6279,7 +6461,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(&swapper_space, ent.val); + page = find_get_page(swap_address_space(ent), ent.val); if (do_swap_account) entry->val = ent.val; @@ -6320,7 +6502,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) *entry = swap; - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif return page; @@ -6469,6 +6651,7 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; + int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -6489,7 +6672,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, PAGE_SIZE * mc.moved_swap); - __mem_cgroup_put(mc.from, mc.moved_swap); + + for (i = 0; i < mc.moved_swap; i++) + css_put(&mc.from->css); if (!mem_cgroup_is_root(mc.to)) { /* @@ -6499,7 +6684,7 @@ static void __mem_cgroup_clear_mc(void) res_counter_uncharge(&mc.to->res, PAGE_SIZE * mc.moved_swap); } - /* we've already done mem_cgroup_get(mc.to) */ + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); @@ -6530,8 +6715,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + unsigned long move_charge_at_immigrate; - if (memcg->move_charge_at_immigrate) { + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_charge_at_immigrate = memcg->move_charge_at_immigrate; + if (move_charge_at_immigrate) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); @@ -6551,6 +6743,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, spin_lock(&mc.lock); mc.from = from; mc.to = memcg; + mc.immigrate_flags = move_charge_at_immigrate; spin_unlock(&mc.lock); /* We set mc.moving_task later */ @@ -6741,33 +6934,37 @@ static void mem_cgroup_move_task(struct cgroup *cont, } #endif +/* + * Cgroup retains root cgroups across [un]mount cycles making it necessary + * to verify sane_behavior flag on each mount attempt. + */ +static void mem_cgroup_bind(struct cgroup *root) +{ + /* + * use_hierarchy is forced with sane_behavior. cgroup core + * guarantees that @root doesn't have any children, so turning it + * on for the root memcg is enough. 
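+ *
+ * (For illustration: every child created under this root afterwards
+ * then inherits the setting via memcg->use_hierarchy =
+ * parent->use_hierarchy in mem_cgroup_css_online(), so the whole tree
+ * stays hierarchical.)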
+ */ + if (cgroup_sane_behavior(root)) + mem_cgroup_from_cont(root)->use_hierarchy = true; +} + struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", .subsys_id = mem_cgroup_subsys_id, .css_alloc = mem_cgroup_css_alloc, + .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, + .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, .use_id = 1, }; -/* - * The rest of init is performed during ->css_alloc() for root css which - * happens before initcalls. hotcpu_notifier() can't be done together as - * it would introduce circular locking by adding cgroup_lock -> cpu hotplug - * dependency. Do it from a subsys_initcall(). - */ -static int __init mem_cgroup_init(void) -{ - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); - return 0; -} -subsys_initcall(mem_cgroup_init); - #ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { @@ -6780,4 +6977,39 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static void __init memsw_file_init(void) +{ + WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); +} + +static void __init enable_swap_cgroup(void) +{ + if (!mem_cgroup_disabled() && really_do_swap_account) { + do_swap_account = 1; + memsw_file_init(); + } +} + +#else +static void __init enable_swap_cgroup(void) +{ +} #endif + +/* + * subsys_initcall() for memory controller. + * + * Some parts like hotcpu_notifier() have to be initialized from this context + * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically + * everything that doesn't depend on a specific mem_cgroup structure should + * be initialized from here. 
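+ *
+ * (The root group's css_alloc() runs before initcalls, so nothing done
+ * here may be a prerequisite for setting up the root mem_cgroup itself.)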
+ */ +static int __init mem_cgroup_init(void) +{ + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); + enable_swap_cgroup(); + mem_cgroup_soft_limit_tree_init(); + memcg_stock_init(); + return 0; +} +subsys_initcall(mem_cgroup_init); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c6e4dd3e1c08..2c13aa7a0164 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0; int sysctl_memory_failure_recovery __read_mostly = 1; -atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) @@ -784,11 +784,11 @@ static struct page_state { { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, - { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, - { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, - { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, + { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, + + { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, + { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, @@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page *hpage; int res; unsigned int nr_pages; + unsigned long page_flags; if (!sysctl_memory_failure_recovery) panic("Memory failure from trap %d on page %lx", trapno, pfn); @@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_trans_order(hpage); - atomic_long_add(nr_pages, &mce_bad_pages); + /* + * Currently errors on hugetlbfs pages are measured in hugepage units, + * so nr_pages should be 1 << compound_order. OTOH when errors are on + * transparent hugepages, they are supposed to be split and error + * measurement is done in normal page units. So nr_pages should be one + * in this case. + */ + if (PageHuge(p)) + nr_pages = 1 << compound_order(hpage); + else /* normal page or thp */ + nr_pages = 1; + atomic_long_add(nr_pages, &num_poisoned_pages); /* * We need/can do nothing about count=0 pages. @@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageHWPoison(hpage) || (hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); return 0; } set_page_hwpoison_huge_page(hpage); @@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags) lock_page(hpage); /* + * We use page flags to determine what action should be taken, but + * the flags can be modified by the error containment action. One + * example is an mlocked page, where PG_mlocked is cleared by + * page_remove_rmap() in try_to_unmap_one(). So to determine page status + * correctly, we save a copy of the page flags at this time. 
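+ *
+ * (The snapshot is used as a fallback below: the error_states lookup is
+ * first tried against the live p->flags and, only if that does not
+ * settle the page state, against the page_flags value saved here.)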
+ */ + page_flags = p->flags; + + /* * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { @@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); unlock_page(hpage); put_page(hpage); return 0; @@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } res = -EBUSY; - for (ps = error_states;; ps++) { - if ((p->flags & ps->mask) == ps->res) { - res = page_action(ps, p, pfn); + /* + * The first check uses the current page flags which may not have any + * relevant information. The second check with the saved page flagss is + * carried out only if the first check can't determine the page status. + */ + for (ps = error_states;; ps++) + if ((p->flags & ps->mask) == ps->res) break; - } - } + if (!ps->mask) + for (ps = error_states;; ps++) + if ((page_flags & ps->mask) == ps->res) + break; + res = page_action(ps, p, pfn); out: unlock_page(hpage); return res; @@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn) return 0; } if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn) */ if (TestClearPageHWPoison(page)) { pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); freeit = 1; if (PageHuge(page)) clear_page_hwpoison_huge_page(page); @@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) * that is not free, and 1 for any other page type. * For 1 the page is returned with increased page count, otherwise not. */ -static int get_any_page(struct page *p, unsigned long pfn, int flags) +static int __get_any_page(struct page *p, unsigned long pfn, int flags) { int ret; @@ -1383,7 +1410,8 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) /* * Isolate the page, so that it doesn't get reallocated if it - * was free. + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. */ set_migratetype_isolate(p, true); /* @@ -1393,11 +1421,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) if (!get_page_unless_zero(compound_head(p))) { if (PageHuge(p)) { pr_info("%s: %#lx free huge page\n", __func__, pfn); - ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + ret = 0; } else if (is_free_buddy_page(p)) { pr_info("%s: %#lx free buddy page\n", __func__, pfn); - /* Set hwpoison bit while page is still isolated */ - SetPageHWPoison(p); ret = 0; } else { pr_info("%s: %#lx: unknown zero refcount page type %lx\n", @@ -1408,48 +1434,71 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } - unset_migratetype_isolate(p, MIGRATE_MOVABLE); unlock_memory_hotplug(); return ret; } +static int get_any_page(struct page *page, unsigned long pfn, int flags) +{ + int ret = __get_any_page(page, pfn, flags); + + if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? 
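+ * (Presumably because shake_page() drains the per-cpu LRU pagevecs and
+ * page lists, a page that was only held there should by now have been
+ * returned to the buddy allocator and be reported as free by the
+ * re-check below.)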
+ */ + ret = __get_any_page(page, pfn, 0); + if (!PageLRU(page)) { + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + } + return ret; +} + static int soft_offline_huge_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); - ret = get_any_page(page, pfn, flags); - if (ret < 0) - return ret; - if (ret == 0) - goto done; - + /* + * This double-check of PageHWPoison is to avoid the race with + * memory_failure(). See also comment in __soft_offline_page(). + */ + lock_page(hpage); if (PageHWPoison(hpage)) { + unlock_page(hpage); put_page(hpage); pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); return -EBUSY; } + unlock_page(hpage); /* Keep page count to indicate a given hugepage is isolated. */ - ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, + ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, MIGRATE_SYNC); put_page(hpage); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); - return ret; - } -done: - if (!PageHWPoison(hpage)) + } else { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); atomic_long_add(1 << compound_trans_order(hpage), - &mce_bad_pages); - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - /* keep elevated page count for bad page */ + &num_poisoned_pages); + } return ret; } +static int __soft_offline_page(struct page *page, int flags); + /** * soft_offline_page - Soft offline a page. * @page: page to offline @@ -1478,9 +1527,11 @@ int soft_offline_page(struct page *page, int flags) unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_trans_head(page); - if (PageHuge(page)) - return soft_offline_huge_page(page, flags); - if (PageTransHuge(hpage)) { + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + if (!PageHuge(page) && PageTransHuge(hpage)) { if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { pr_info("soft offline: %#lx: failed to split THP\n", pfn); @@ -1491,47 +1542,45 @@ int soft_offline_page(struct page *page, int flags) ret = get_any_page(page, pfn, flags); if (ret < 0) return ret; - if (ret == 0) - goto done; - - /* - * Page cache page we can handle? - */ - if (!PageLRU(page)) { - /* - * Try to free it. - */ - put_page(page); - shake_page(page, 1); - - /* - * Did it turn free? 
- */ - ret = get_any_page(page, pfn, 0); - if (ret < 0) - return ret; - if (ret == 0) - goto done; - } - if (!PageLRU(page)) { - pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", - pfn, page->flags); - return -EIO; + if (ret) { /* for in-use pages */ + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + } else { /* for free pages */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_trans_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } } + unset_migratetype_isolate(page, MIGRATE_MOVABLE); + return ret; +} - lock_page(page); - wait_on_page_writeback(page); +static int __soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); /* - * Synchronized using the page lock with memory_failure() + * Check PageHWPoison again inside page lock because PageHWPoison + * is set by memory_failure() outside page lock. Note that + * memory_failure() also double-checks PageHWPoison inside page lock, + * so there's no race between soft_offline_page() and memory_failure(). */ + lock_page(page); + wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); put_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } - /* * Try to invalidate first. This should work for * non dirty unmapped page cache pages. @@ -1544,9 +1593,10 @@ int soft_offline_page(struct page *page, int flags) */ if (ret == 1) { put_page(page); - ret = 0; pr_info("soft_offline: %#lx: invalidated\n", pfn); - goto done; + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + return 0; } /* @@ -1563,28 +1613,38 @@ int soft_offline_page(struct page *page, int flags) if (!ret) { LIST_HEAD(pagelist); inc_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC, - MR_MEMORY_FAILURE); + MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; + } else { + /* + * After page migration succeeds, the source page can + * be trapped in pagevec and actual freeing is delayed. + * Freeing code works differently based on PG_hwpoison, + * so there's a race. We need to make sure that the + * source page should be freed back to buddy before + * setting PG_hwpoison. + */ + if (!is_free_buddy_page(page)) + lru_add_drain_all(); + if (!is_free_buddy_page(page)) + drain_all_pages(); + SetPageHWPoison(page); + if (!is_free_buddy_page(page)) + pr_info("soft offline: %#lx: page leaked\n", + pfn); + atomic_long_inc(&num_poisoned_pages); } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } - if (ret) - return ret; - -done: - atomic_long_add(1, &mce_bad_pages); - SetPageHWPoison(page); - /* keep elevated page count for bad page */ return ret; } diff --git a/mm/memory.c b/mm/memory.c index e0a9b0ce4f10..1ce2e2a734fc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,6 +69,10 @@ #include "internal.h" +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 
+#endif + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -78,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr); EXPORT_SYMBOL(mem_map); #endif -unsigned long num_physpages; /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end @@ -88,7 +91,6 @@ unsigned long num_physpages; */ void * high_memory; -EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); /* @@ -184,10 +186,14 @@ static int tlb_next_batch(struct mmu_gather *tlb) return 1; } + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return 0; + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return 0; + tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; @@ -208,14 +214,15 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->mm = mm; tlb->fullmm = fullmm; + tlb->need_flush_all = 0; tlb->start = -1UL; tlb->end = 0; tlb->need_flush = 0; - tlb->fast_mode = (num_possible_cpus() == 1); tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; + tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; @@ -234,9 +241,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb_table_flush(tlb); #endif - if (tlb_fast_mode(tlb)) - return; - for (batch = &tlb->local; batch; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; @@ -278,11 +282,6 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) VM_BUG_ON(!tlb->need_flush); - if (tlb_fast_mode(tlb)) { - free_page_and_swap_cache(page); - return 1; /* avoid calling tlb_flush_mmu() */ - } - batch = tlb->active; batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { @@ -705,13 +704,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ if (vma->vm_ops) - print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", - (unsigned long)vma->vm_ops->fault); + printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", + vma->vm_ops->fault); if (vma->vm_file && vma->vm_file->f_op) - print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", - (unsigned long)vma->vm_file->f_op->mmap); + printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", + vma->vm_file->f_op->mmap); dump_stack(); - add_taint(TAINT_BAD_PAGE); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static inline bool is_cow_mapping(vm_flags_t flags) @@ -1100,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + unsigned long range_start = addr; again: init_rss_vec(rss); @@ -1150,7 +1150,7 @@ again: if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && - likely(!VM_SequentialReadHint(vma))) + likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); rss[MM_FILEPAGES]--; } @@ -1205,12 +1205,14 @@ again: force_flush = 0; #ifdef HAVE_GENERIC_MMU_GATHER - tlb->start = addr; - tlb->end = end; + tlb->start = range_start; + tlb->end = addr; #endif tlb_flush_mmu(tlb); - if (addr != end) + if (addr != end) { + range_start = addr; goto again; + } } return addr; @@ -1453,10 +1455,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, EXPORT_SYMBOL_GPL(zap_vma_ptes); /** - * follow_page - look up a page descriptor from a user-virtual address + * follow_page_mask - look up a page descriptor from 
a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * @@ -1464,8 +1467,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) { pgd_t *pgd; pud_t *pud; @@ -1475,6 +1479,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct page *page; struct mm_struct *mm = vma->vm_mm; + *page_mask = 0; + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { BUG_ON(flags & FOLL_GET); @@ -1521,6 +1527,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(&mm->page_table_lock); + *page_mask = HPAGE_PMD_NR - 1; goto out; } } else @@ -1534,8 +1541,24 @@ split_fallthrough: ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) - goto no_page; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto split_fallthrough; + } if ((flags & FOLL_NUMA) && pte_numa(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) @@ -1668,15 +1691,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { - int i; + long i; unsigned long vm_flags; + unsigned int page_mask; - if (nr_pages <= 0) + if (!nr_pages) return 0; VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); @@ -1752,6 +1776,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, get_page(page); } pte_unmap(pte); + page_mask = 0; goto next_page; } @@ -1769,6 +1794,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, do { struct page *page; unsigned int foll_flags = gup_flags; + unsigned int page_increm; /* * If we have a pending SIGKILL, don't keep faulting @@ -1778,7 +1804,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? 
i : -ERESTARTSYS; cond_resched(); - while (!(page = follow_page(vma, start, foll_flags))) { + while (!(page = follow_page_mask(vma, start, + foll_flags, &page_mask))) { int ret; unsigned int fault_flags = 0; @@ -1852,13 +1879,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, flush_anon_page(vma, page, start); flush_dcache_page(page); + page_mask = 0; } next_page: - if (vmas) + if (vmas) { vmas[i] = vma; - i++; - start += PAGE_SIZE; - nr_pages--; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; } while (nr_pages && start < vma->vm_end); } while (nr_pages); return i; @@ -1972,9 +2005,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * * See also get_user_pages_fast, for performance critical applications. */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) { int flags = FOLL_TOUCH; @@ -2352,6 +2385,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. + */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? 
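+	 * (i.e. the vma, offset by vm_pgoff, must fit entirely within the
+	 * remaining part of the physical range the caller passed in)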
*/ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) @@ -2825,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, details->first_index, details->last_index) { vba = vma->vm_pgoff; - vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + vea = vba + vma_pages(vma) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) @@ -2914,7 +2994,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *page, *swapcache = NULL; + struct page *page, *swapcache; swp_entry_t entry; pte_t pte; int locked; @@ -2965,9 +3045,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + swapcache = page; goto out_release; } + swapcache = page; locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -2985,16 +3067,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - if (ksm_might_need_to_copy(page, vma, address)) { - swapcache = page; - page = ksm_does_need_to_copy(page, vma, address); - - if (unlikely(!page)) { - ret = VM_FAULT_OOM; - page = swapcache; - swapcache = NULL; - goto out_page; - } + page = ksm_might_need_to_copy(page, vma, address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + goto out_page; } if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { @@ -3039,7 +3116,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - do_page_add_anon_rmap(page, vma, address, exclusive); + if (page == swapcache) + do_page_add_anon_rmap(page, vma, address, exclusive); + else /* ksm created a completely new copy */ + page_add_new_anon_rmap(page, vma, address); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -3047,7 +3127,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); - if (swapcache) { + if (page != swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -3080,7 +3160,7 @@ out_page: unlock_page(page); out_release: page_cache_release(page); - if (swapcache) { + if (page != swapcache) { unlock_page(swapcache); page_cache_release(swapcache); } @@ -3156,6 +3236,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. 
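+ *
+ * (Without the barrier, another thread could observe the new pte and
+ * read the page before the zeroing stores are visible to its CPU.)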
+ */ __SetPageUptodate(page); if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) @@ -3706,6 +3791,14 @@ retry: if (pmd_trans_huge(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; + /* + * If the pmd is splitting, return and retry the + * the fault. Alternative: wait until the split + * is done, and goto retry. + */ + if (pmd_trans_splitting(orig_pmd)) + return 0; + if (pmd_numa(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); @@ -3808,30 +3901,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - if (!vma) - return -ENOMEM; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; - BUG_ON(addr >= end); - BUG_ON(end > vma->vm_end); - len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - if (ret < 0) - return ret; - return ret == len ? 0 : -EFAULT; -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) @@ -4133,7 +4202,7 @@ void print_vma_addr(char *prefix, unsigned long ip) up_read(&mm->mmap_sem); } -#ifdef CONFIG_PROVE_LOCKING +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void might_fault(void) { /* @@ -4145,13 +4214,17 @@ void might_fault(void) if (segment_eq(get_fs(), KERNEL_DS)) return; - might_sleep(); /* * it would be nicer only to annotate paths which are not under * pagefault_disable, however that requires a larger audit and * providing helpers like get_user_atomic. 
*/ - if (!in_atomic() && current->mm) + if (in_atomic()) + return; + + __might_sleep(__FILE__, __LINE__, 0); + + if (current->mm) might_lock_read(¤t->mm->mmap_sem); } EXPORT_SYMBOL(might_fault); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d04ed87bfacb..ca1dd3aa5eee 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -29,6 +29,7 @@ #include <linux/suspend.h> #include <linux/mm_inline.h> #include <linux/firmware-map.h> +#include <linux/stop_machine.h> #include <asm/tlbflush.h> @@ -74,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->end = start + size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %pR cannot be added\n", res); + pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); res = NULL; } @@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) +void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { page->lru.next = (struct list_head *) type; SetPagePrivate(page); @@ -101,12 +101,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, atomic_inc(&page->_count); } -/* reference to __meminit __free_pages_bootmem is valid - * so use __ref to tell modpost not to generate a warning */ -void __ref put_page_bootmem(struct page *page) +void put_page_bootmem(struct page *page) { unsigned long type; - static DEFINE_MUTEX(ppb_lock); type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,18 +113,12 @@ void __ref put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); - - /* - * Please refer to comment for __free_pages_bootmem() - * for why we serialize here. 
- */ - mutex_lock(&ppb_lock); - __free_pages_bootmem(page, 0); - mutex_unlock(&ppb_lock); + free_reserved_page(page); } - } +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +#ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; @@ -161,6 +152,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ void register_page_bootmem_info_node(struct pglist_data *pgdat) { @@ -189,21 +206,21 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } pfn = pgdat->node_start_pfn; - end_pfn = pfn + pgdat->node_spanned_pages; + end_pfn = pgdat_end_pfn(pgdat); - /* register_section info */ + /* register section info */ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { /* * Some platforms can assign the same pfn to multiple nodes - on * node0 as well as nodeN. To avoid registering a pfn against * multiple nodes we check that this pfn does not already - * reside in some other node. + * reside in some other nodes. 
*/ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) register_page_bootmem_info_section(pfn); } } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) @@ -253,6 +270,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, set_page_links(pfn_to_page(pfn), zid, nid, pfn); } +/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or + * alloc_bootmem_node_nopanic() */ +static int __ref ensure_zone_is_initialized(struct zone *zone, + unsigned long start_pfn, unsigned long num_pages) +{ + if (!zone_is_initialized(zone)) + return init_currently_empty_zone(zone, start_pfn, num_pages, + MEMMAP_HOTPLUG); + return 0; +} + static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { @@ -260,19 +288,16 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long flags; unsigned long z1_start_pfn; - if (!z1->wait_table) { - ret = init_currently_empty_zone(z1, start_pfn, - end_pfn - start_pfn, MEMMAP_HOTPLUG); - if (ret) - return ret; - } + ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; pgdat_resize_lock(z1->zone_pgdat, &flags); /* can't move pfns which are higher than @z2 */ - if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) + if (end_pfn > zone_end_pfn(z2)) goto out_fail; - /* the move out part mast at the left most of @z2 */ + /* the move out part must be at the left most of @z2 */ if (start_pfn > z2->zone_start_pfn) goto out_fail; /* must included/overlap */ @@ -286,7 +311,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, z1_start_pfn = start_pfn; resize_zone(z1, z1_start_pfn, end_pfn); - resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); + resize_zone(z2, end_pfn, zone_end_pfn(z2)); pgdat_resize_unlock(z1->zone_pgdat, &flags); @@ -305,12 +330,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, unsigned long flags; unsigned long z2_end_pfn; - if (!z2->wait_table) { - ret = init_currently_empty_zone(z2, start_pfn, - end_pfn - start_pfn, MEMMAP_HOTPLUG); - if (ret) - return ret; - } + ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -318,15 +340,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, if (z1->zone_start_pfn > start_pfn) goto out_fail; /* the move out part mast at the right most of @z1 */ - if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) + if (zone_end_pfn(z1) > end_pfn) goto out_fail; /* must included/overlap */ - if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) + if (start_pfn >= zone_end_pfn(z1)) goto out_fail; /* use end_pfn for z2's end_pfn if z2 is empty */ if (z2->spanned_pages) - z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; + z2_end_pfn = zone_end_pfn(z2); else z2_end_pfn = end_pfn; @@ -363,16 +385,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nid = pgdat->node_id; int zone_type; unsigned long flags; + int ret; zone_type = zone - pgdat->node_zones; - if (!zone->wait_table) { - int ret; + ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); + if (ret) + return ret; - ret = init_currently_empty_zone(zone, phys_start_pfn, - nr_pages, MEMMAP_HOTPLUG); - if (ret) - return ret; - } pgdat_resize_lock(zone->zone_pgdat, 
&flags); grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, @@ -405,36 +424,6 @@ static int __meminit __add_section(int nid, struct zone *zone, return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP -static int __remove_section(struct zone *zone, struct mem_section *ms) -{ - /* - * XXX: Freeing memmap with vmemmap is not implement yet. - * This should be removed later. - */ - return -EBUSY; -} -#else -static int __remove_section(struct zone *zone, struct mem_section *ms) -{ - unsigned long flags; - struct pglist_data *pgdat = zone->zone_pgdat; - int ret = -EINVAL; - - if (!valid_section(ms)) - return ret; - - ret = unregister_memory_section(ms); - if (ret) - return ret; - - pgdat_resize_lock(pgdat, &flags); - sparse_remove_one_section(zone, ms); - pgdat_resize_unlock(pgdat, &flags); - return 0; -} -#endif - /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will @@ -468,6 +457,229 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__add_pages); +#ifdef CONFIG_MEMORY_HOTREMOVE +/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ +static int find_smallest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(start_pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(start_pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(start_pfn))) + continue; + + return start_pfn; + } + + return 0; +} + +/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ +static int find_biggest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + unsigned long pfn; + + /* pfn is the end pfn of a memory section. */ + pfn = end_pfn - 1; + for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(pfn))) + continue; + + return pfn; + } + + return 0; +} + +static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long pfn; + struct mem_section *ms; + int nid = zone_to_nid(zone); + + zone_span_writelock(zone); + if (zone_start_pfn == start_pfn) { + /* + * If the section is smallest section in the zone, it need + * shrink zone->zone_start_pfn and zone->zone_spanned_pages. + * In this case, we find second smallest valid mem_section + * for shrinking zone. + */ + pfn = find_smallest_section_pfn(nid, zone, end_pfn, + zone_end_pfn); + if (pfn) { + zone->zone_start_pfn = pfn; + zone->spanned_pages = zone_end_pfn - pfn; + } + } else if (zone_end_pfn == end_pfn) { + /* + * If the section is biggest section in the zone, it need + * shrink zone->spanned_pages. + * In this case, we find second biggest valid mem_section for + * shrinking zone. 
+ */ + pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, + start_pfn); + if (pfn) + zone->spanned_pages = pfn - zone_start_pfn + 1; + } + + /* + * The section is not biggest or smallest mem_section in the zone, it + * only creates a hole in the zone. So in this case, we need not + * change the zone. But perhaps, the zone has only hole data. Thus + * it check the zone has only hole or not. + */ + pfn = zone_start_pfn; + for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (page_zone(pfn_to_page(pfn)) != zone) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + zone_span_writeunlock(zone); + return; + } + + /* The zone has no valid section */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + zone_span_writeunlock(zone); +} + +static void shrink_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pgdat_start_pfn = pgdat->node_start_pfn; + unsigned long pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + struct mem_section *ms; + int nid = pgdat->node_id; + + if (pgdat_start_pfn == start_pfn) { + /* + * If the section is smallest section in the pgdat, it need + * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. + * In this case, we find second smallest valid mem_section + * for shrinking zone. + */ + pfn = find_smallest_section_pfn(nid, NULL, end_pfn, + pgdat_end_pfn); + if (pfn) { + pgdat->node_start_pfn = pfn; + pgdat->node_spanned_pages = pgdat_end_pfn - pfn; + } + } else if (pgdat_end_pfn == end_pfn) { + /* + * If the section is biggest section in the pgdat, it need + * shrink pgdat->node_spanned_pages. + * In this case, we find second biggest valid mem_section for + * shrinking zone. + */ + pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, + start_pfn); + if (pfn) + pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; + } + + /* + * If the section is not biggest or smallest mem_section in the pgdat, + * it only creates a hole in the pgdat. So in this case, we need not + * change the pgdat. + * But perhaps, the pgdat has only hole data. Thus it check the pgdat + * has only hole or not. 
+ */ + pfn = pgdat_start_pfn; + for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + return; + } + + /* The pgdat has no valid section */ + pgdat->node_start_pfn = 0; + pgdat->node_spanned_pages = 0; +} + +static void __remove_zone(struct zone *zone, unsigned long start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int zone_type; + unsigned long flags; + + zone_type = zone - pgdat->node_zones; + + pgdat_resize_lock(zone->zone_pgdat, &flags); + shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); + shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + unsigned long start_pfn; + int scn_nr; + int ret = -EINVAL; + + if (!valid_section(ms)) + return ret; + + ret = unregister_memory_section(ms); + if (ret) + return ret; + + scn_nr = __section_nr(ms); + start_pfn = section_nr_to_pfn(scn_nr); + __remove_zone(zone, start_pfn); + + sparse_remove_one_section(zone, ms); + return 0; +} + /** * __remove_pages() - remove sections of pages from a zone * @zone: zone from which pages need to be removed @@ -482,8 +694,10 @@ EXPORT_SYMBOL_GPL(__add_pages); int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { - unsigned long i, ret = 0; + unsigned long i; int sections_to_remove; + resource_size_t start, size; + int ret = 0; /* * We can only remove entire sections @@ -491,7 +705,15 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { @@ -503,6 +725,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, return ret; } EXPORT_SYMBOL_GPL(__remove_pages); +#endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) { @@ -540,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback); void __online_page_set_limits(struct page *page) { - unsigned long pfn = page_to_pfn(page); - - if (pfn >= num_physpages) - num_physpages = pfn + 1; } EXPORT_SYMBOL_GPL(__online_page_set_limits); void __online_page_increment_counters(struct page *page) { - totalram_pages++; - -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages++; -#endif + adjust_managed_page_count(page, 1); } EXPORT_SYMBOL_GPL(__online_page_increment_counters); void __online_page_free(struct page *page) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } EXPORT_SYMBOL_GPL(__online_page_free); @@ -683,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { + unsigned long flags; 
unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; @@ -701,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && !can_online_high_movable(zone)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } @@ -759,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ return ret; } - zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += onlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + if (onlined_pages) { node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) @@ -797,11 +1013,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = start >> PAGE_SHIFT; - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL; + pgdat = NODE_DATA(nid); + if (!pgdat) { + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; - arch_refresh_nodedata(nid, pgdat); + arch_refresh_nodedata(nid, pgdat); + } /* we can use NODE_DATA(nid) from here */ @@ -854,7 +1073,8 @@ out: int __ref add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; - int new_pgdat = 0; + bool new_pgdat; + bool new_node; struct resource *res; int ret; @@ -865,12 +1085,16 @@ int __ref add_memory(int nid, u64 start, u64 size) if (!res) goto out; - if (!node_online(nid)) { + { /* Stupid hack to suppress address-never-null warning */ + void *p = NODE_DATA(nid); + new_pgdat = !p; + } + new_node = !node_online(nid); + if (new_node) { pgdat = hotadd_new_pgdat(nid, start); ret = -ENOMEM; if (!pgdat) goto error; - new_pgdat = 1; } /* call arch's memory hotadd */ @@ -882,7 +1106,7 @@ int __ref add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); - if (new_pgdat) { + if (new_node) { ret = register_one_node(nid); /* * If sysfs file of new node can't create, cpu on the node @@ -901,8 +1125,7 @@ error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); - if (res) - release_memory_resource(res); + release_memory_resource(res); out: unlock_memory_hotplug(); @@ -1058,8 +1281,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. 
*/ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC, - MR_MEMORY_HOTPLUG); + MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_lru_pages(&source); } @@ -1246,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long pfn, nr_pages, expire; long offlined_pages; int ret, drain, retry_max, node; + unsigned long flags; struct zone *zone; struct memory_notify arg; @@ -1337,10 +1560,12 @@ repeat: /* reset pagetype flags and makes migrate type to be MOVABLE */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ - zone->managed_pages -= offlined_pages; + adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; - totalram_pages -= offlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); @@ -1380,18 +1605,28 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); } +#endif /* CONFIG_MEMORY_HOTREMOVE */ -int remove_memory(u64 start, u64 size) +/** + * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) + * @start_pfn: start pfn of the memory range + * @end_pfn: end pfn of the memory range + * @arg: argument passed to func + * @func: callback for each memory section walked + * + * This function walks through all present mem sections in range + * [start_pfn, end_pfn) and call func on each mem section. + * + * Returns the return value of func. + */ +int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, + void *arg, int (*func)(struct memory_block *, void *)) { struct memory_block *mem = NULL; struct mem_section *section; - unsigned long start_pfn, end_pfn; unsigned long pfn, section_nr; int ret; - start_pfn = PFN_DOWN(start); - end_pfn = start_pfn + PFN_DOWN(size); - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { section_nr = pfn_to_section_nr(pfn); if (!present_section_nr(section_nr)) @@ -1408,7 +1643,7 @@ int remove_memory(u64 start, u64 size) if (!mem) continue; - ret = offline_memory_block(mem); + ret = func(mem, arg); if (ret) { kobject_put(&mem->dev.kobj); return ret; @@ -1420,14 +1655,159 @@ int remove_memory(u64 start, u64 size) return 0; } -#else -int offline_pages(unsigned long start_pfn, unsigned long nr_pages) + +#ifdef CONFIG_MEMORY_HOTREMOVE +static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) { - return -EINVAL; + int ret = !is_memblock_offlined(mem); + + if (unlikely(ret)) { + phys_addr_t beginpa, endpa; + + beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); + endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; + pr_warn("removing memory fails, because memory " + "[%pa-%pa] is onlined\n", + &beginpa, &endpa); + } + + return ret; } -int remove_memory(u64 start, u64 size) + +static int check_cpu_on_node(void *data) { - return -EINVAL; + struct pglist_data *pgdat = data; + int cpu; + + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == pgdat->node_id) + /* + * the cpu on this node isn't removed, and we can't + * offline this node. 
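As a usage illustration of the walk_memory_range() helper introduced a little above (remove_memory() below drives it with is_memblock_offlined_cb()), a hypothetical caller could count the offline blocks in a physical range; the wrapper and callback names here are made up:

/* Callback type matches int (*func)(struct memory_block *, void *). */
static int count_offline_cb(struct memory_block *mem, void *arg)
{
	int *offline = arg;

	if (is_memblock_offlined(mem))
		(*offline)++;
	return 0;				/* non-zero stops the walk */
}

static int count_offline_blocks(u64 start, u64 size)
{
	int offline = 0;

	walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
			  &offline, count_offline_cb);
	return offline;
}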
+ */ + return -EBUSY; + } + + return 0; +} + +static void unmap_cpu_on_node(void *data) +{ +#ifdef CONFIG_ACPI_NUMA + struct pglist_data *pgdat = data; + int cpu; + + for_each_possible_cpu(cpu) + if (cpu_to_node(cpu) == pgdat->node_id) + numa_clear_node(cpu); +#endif +} + +static int check_and_unmap_cpu_on_node(void *data) +{ + int ret = check_cpu_on_node(data); + + if (ret) + return ret; + + /* + * the node will be offlined when we come here, so we can clear + * the cpu_to_node() now. + */ + + unmap_cpu_on_node(data); + return 0; +} + +/* offline the node if all memory sections of this node are removed */ +void try_offline_node(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + struct page *pgdat_page = virt_to_page(pgdat); + int i; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!present_section_nr(section_nr)) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* + * some memory sections of this node are not removed, and we + * can't offline node now. + */ + return; + } + + if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) + return; + + /* + * all memory/cpu of this node are removed, we can offline this + * node now. + */ + node_set_offline(nid); + unregister_one_node(nid); + + if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) + /* node data is allocated from boot memory */ + return; + + /* free waittable in each zone */ + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + + /* + * wait_table may be allocated from boot memory, + * here only free if it's allocated by vmalloc. + */ + if (is_vmalloc_addr(zone->wait_table)) + vfree(zone->wait_table); + } + + /* + * Since there is no way to guarentee the address of pgdat/zone is not + * on stack of any kernel threads or used by other kernel objects + * without reference counting or other symchronizing method, do not + * reset node_data and free pgdat here. Just reset it to 0 and reuse + * the memory when the node is online again. + */ + memset(pgdat, 0, sizeof(*pgdat)); +} +EXPORT_SYMBOL(try_offline_node); + +void __ref remove_memory(int nid, u64 start, u64 size) +{ + int ret; + + lock_memory_hotplug(); + + /* + * All memory blocks must be offlined before removing memory. Check + * whether all memory blocks in question are offline and trigger a BUG() + * if this is not the case. + */ + ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, + is_memblock_offlined_cb); + if (ret) { + unlock_memory_hotplug(); + BUG(); + } + + /* remove memmap entry */ + firmware_map_remove(start, start + size, "System RAM"); + + arch_remove_memory(start, size); + + try_offline_node(nid); + + unlock_memory_hotplug(); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ EXPORT_SYMBOL_GPL(remove_memory); +#endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d1b315e98627..74310017296e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -26,7 +26,7 @@ * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. - * As a special case node -1 here means do the allocation + * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. 
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p) if (!pol) { node = numa_node_id(); - if (node != -1) + if (node != NUMA_NO_NODE) pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ @@ -161,19 +161,7 @@ static const struct mempolicy_operations { /* Check that the nodemask contains at least one populated zone */ static int is_valid_nodemask(const nodemask_t *nodemask) { - int nd, k; - - for_each_node_mask(nd, *nodemask) { - struct zone *z; - - for (k = 0; k <= policy_zone; k++) { - z = &NODE_DATA(nd)->node_zones[k]; - if (z->present_pages > 0) - return 1; - } - } - - return 0; + return nodes_intersects(*nodemask, node_states[N_MEMORY]); } static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, struct mempolicy *policy; pr_debug("setting mode %d flags %d nodes[0] %lx\n", - mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); + mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) @@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* * vm_normal_page() filters out zero pages, but there might * still be PageReserved pages to skip, perhaps in a VDSO. - * And we cannot move PageKsm pages sensibly or safely yet. */ - if (PageReserved(page) || PageKsm(page)) + if (PageReserved(page)) continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC, - MR_SYSCALL); + MIGRATE_SYNC, MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len, pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", start, start + len, mode, mode_flags, - nmask ? nodes_addr(*nmask)[0] : -1); + nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { @@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, - false, MIGRATE_SYNC, - MR_MEMPOLICY_MBIND); + (unsigned long)vma, + MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_lru_pages(&pagelist); } @@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } +static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +{ + enum zone_type dynamic_policy_zone = policy_zone; + + BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); + + /* + * if policy->v.nodes has movable memory only, + * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. + * + * policy->v.nodes is intersect with node_states[N_MEMORY]. + * so if the following test faile, it implies + * policy->v.nodes has movable memory only. 
+ */ + if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) + dynamic_policy_zone = ZONE_MOVABLE; + + return zone >= dynamic_policy_zone; +} + /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation @@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->mode == MPOL_BIND) && - gfp_zone(gfp) >= policy_zone && + apply_policy_zone(policy, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) return &policy->v.nodes; @@ -2132,7 +2137,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->mutex */ +/* Caller holds sp->lock */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2196,13 +2201,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - mutex_lock(&sp->mutex); + spin_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); return pol; } @@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_xchg_last_nid(page, polnid); + last_nid = page_nid_xchg_last(page, polnid); if (last_nid != polnid) goto out; } @@ -2328,6 +2333,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) sp_free(n); } +static void sp_node_init(struct sp_node *node, unsigned long start, + unsigned long end, struct mempolicy *pol) +{ + node->start = start; + node->end = end; + node->policy = pol; +} + static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { @@ -2344,10 +2357,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, return NULL; } newpol->flags |= MPOL_F_SHARED; - - n->start = start; - n->end = end; - n->policy = newpol; + sp_node_init(n, start, end, newpol); return n; } @@ -2357,9 +2367,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { struct sp_node *n; + struct sp_node *n_new = NULL; + struct mempolicy *mpol_new = NULL; int ret = 0; - mutex_lock(&sp->mutex); +restart: + spin_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2372,14 +2385,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } else { /* Old policy spanning whole new range. 
*/ if (n->end > end) { - struct sp_node *new2; - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) { - ret = -ENOMEM; - goto out; - } + if (!n_new) + goto alloc_new; + + *mpol_new = *n->policy; + atomic_set(&mpol_new->refcnt, 1); + sp_node_init(n_new, end, n->end, mpol_new); n->end = start; - sp_insert(sp, new2); + sp_insert(sp, n_new); + n_new = NULL; + mpol_new = NULL; break; } else n->end = start; @@ -2390,9 +2405,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } if (new) sp_insert(sp, new); -out: - mutex_unlock(&sp->mutex); + spin_unlock(&sp->lock); + ret = 0; + +err_out: + if (mpol_new) + mpol_put(mpol_new); + if (n_new) + kmem_cache_free(sn_cache, n_new); + return ret; + +alloc_new: + spin_unlock(&sp->lock); + ret = -ENOMEM; + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n_new) + goto err_out; + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!mpol_new) + goto err_out; + goto restart; } /** @@ -2410,7 +2443,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - mutex_init(&sp->mutex); + spin_lock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2455,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info, vma->vm_pgoff, sz, npol ? npol->mode : -1, npol ? npol->flags : -1, - npol ? nodes_addr(npol->v.nodes)[0] : -1); + npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -2476,14 +2509,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - mutex_lock(&p->mutex); + spin_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - mutex_unlock(&p->mutex); + spin_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING @@ -2595,8 +2628,7 @@ void numa_default_policy(void) */ /* - * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag - * Used only for mpol_parse_str() and mpol_to_str() + * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. */ static const char * const policy_modes[] = { @@ -2610,28 +2642,20 @@ static const char * const policy_modes[] = #ifdef CONFIG_TMPFS /** - * mpol_parse_str - parse string to mempolicy + * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. - * @no_context: flag whether to "contextualize" the mempolicy * * Format of input: * <mode>[=<flags>][:<nodelist>] * - * if @no_context is true, save the input nodemask in w.user_nodemask in - * the returned mempolicy. This will be used to "clone" the mempolicy in - * a specific context [cpuset] at a later time. Used to parse tmpfs mpol - * mount option. Note that if 'static' or 'relative' mode flags were - * specified, the input nodemask will already have been saved. Saving - * it again is redundant, but safe. 
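The shared_policy_replace() rework earlier in this hunk is an instance of a standard pattern once sp->lock became a spinlock: allocations that may sleep happen with the lock dropped, and the lookup is then restarted. A generic sketch of the same pattern with hypothetical my_cache/my_node types:

struct my_node {
	struct list_head	list;
	int			key;
};

struct my_cache {
	spinlock_t		lock;
	struct list_head	nodes;
};

static struct my_node *find_locked(struct my_cache *c, int key)
{
	struct my_node *n;

	list_for_each_entry(n, &c->nodes, list)
		if (n->key == key)
			return n;
	return NULL;
}

static int cache_insert(struct my_cache *c, int key)
{
	struct my_node *new = NULL;

	for (;;) {
		spin_lock(&c->lock);
		if (find_locked(c, key)) {	/* raced: already present */
			spin_unlock(&c->lock);
			kfree(new);		/* kfree(NULL) is a no-op */
			return 0;
		}
		if (new) {
			new->key = key;
			list_add(&new->list, &c->nodes);
			spin_unlock(&c->lock);
			return 0;
		}
		/* GFP_KERNEL may sleep: drop the spinlock, allocate, retry. */
		spin_unlock(&c->lock);
		new = kmalloc(sizeof(*new), GFP_KERNEL);
		if (!new)
			return -ENOMEM;
	}
}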
- * * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode; - unsigned short uninitialized_var(mode_flags); + unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); @@ -2719,24 +2743,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (IS_ERR(new)) goto out; - if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; - } else { - int ret; - NODEMASK_SCRATCH(scratch); - if (scratch) { - task_lock(current); - ret = mpol_set_nodemask(new, &nodes, scratch); - task_unlock(current); - } else - ret = -ENOMEM; - NODEMASK_SCRATCH_FREE(scratch); - if (ret) { - mpol_put(new); - goto out; - } - } + /* + * Save nodes for mpol_to_str() to show the tmpfs mount options + * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. + */ + if (mode != MPOL_PREFERRED) + new->v.nodes = nodes; + else if (nodelist) + new->v.preferred_node = first_node(nodes); + else + new->flags |= MPOL_F_LOCAL; + + /* + * Save nodes for contextualization: this will be used to "clone" + * the mempolicy in a specific context [cpuset] at a later time. + */ + new->w.user_nodemask = nodes; + err = 0; out: @@ -2756,13 +2779,12 @@ out: * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted - * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask * * Convert a mempolicy into a string. * Returns the number of characters in buffer (if positive) * or an error (negative) */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; int l; @@ -2788,7 +2810,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_PREFERRED: nodes_clear(nodes); if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; /* pseudo-policy */ + mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; @@ -2796,10 +2818,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - if (no_context) - nodes = pol->w.user_nodemask; - else - nodes = pol->v.nodes; + nodes = pol->v.nodes; break; default: diff --git a/mm/migrate.c b/mm/migrate.c index 3b676b0c5c3e..6f0c24438bba 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -160,10 +160,12 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); #ifdef CONFIG_HUGETLB_PAGE - if (PageHuge(new)) + if (PageHuge(new)) { pte = pte_mkhuge(pte); + pte = arch_make_huge_pte(pte, vma, new, 0); + } #endif - flush_cache_page(vma, addr, pte_pfn(pte)); + flush_dcache_page(new); set_pte_at(mm, addr, ptep, pte); if (PageHuge(new)) { @@ -198,15 +200,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. 
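The comment above describes the caller's side of the contract: a fault handler that finds a migration entry just waits and lets the fault retry. A simplified sketch of that pattern, condensed from the usual do_swap_page() handling (the wrapper name and -EINVAL return are illustrative):

static int wait_if_migration_entry(struct mm_struct *mm, pmd_t *pmd,
				   unsigned long address, pte_t orig_pte)
{
	swp_entry_t entry = pte_to_swp_entry(orig_pte);

	if (!is_migration_entry(entry))
		return -EINVAL;			/* not a migration entry */

	/* Sleeps on the lock of the page being migrated (see above). */
	migration_entry_wait(mm, pmd, address);

	/* Returning 0 lets the faulting instruction run and fault again. */
	return 0;
}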
*/ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -234,6 +235,20 @@ out: pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = &(mm)->page_table_lock; + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, @@ -462,7 +477,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); - + /* + * Please do not reorder this without considering how mm/ksm.c's + * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). + */ ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -696,7 +714,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, enum migrate_mode mode) + int force, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; @@ -726,26 +744,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, lock_page(page); } - /* - * Only memory hotplug's offline_pages() caller has locked out KSM, - * and can safely migrate a KSM page. The other cases have skipped - * PageKsm along with PageReserved - but it is only now when we have - * the page lock that we can be certain it will not go KSM beneath us - * (KSM will not upgrade a page from PageAnon to PageKsm when it sees - * its pagecount raised, but only here do we take the page lock which - * serializes that). - */ - if (PageKsm(page) && !offlining) { - rc = -EBUSY; - goto unlock; - } - /* charge against new page */ mem_cgroup_prepare_migration(page, newpage, &mem); if (PageWriteback(page)) { /* - * Only in the case of a full syncronous migration is it + * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much @@ -766,7 +770,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ - if (PageAnon(page)) { + if (PageAnon(page) && !PageKsm(page)) { /* * Only page_lock_anon_vma_read() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. @@ -846,7 +850,6 @@ uncharge: mem_cgroup_end_migration(mem, page, newpage, (rc == MIGRATEPAGE_SUCCESS || rc == MIGRATEPAGE_BALLOON_SUCCESS)); -unlock: unlock_page(page); out: return rc; @@ -857,8 +860,7 @@ out: * to the newly allocated page in newpage. 
*/ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, - enum migrate_mode mode) + struct page *page, int force, enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -876,7 +878,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (unlikely(split_huge_page(page))) goto out; - rc = __unmap_and_move(page, newpage, force, offlining, mode); + rc = __unmap_and_move(page, newpage, force, mode); if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { /* @@ -936,8 +938,7 @@ out: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, bool offlining, - enum migrate_mode mode) + int force, enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -985,23 +986,26 @@ out: } /* - * migrate_pages + * migrate_pages - migrate the pages specified in a list, to the free pages + * supplied as the target for the page migration * - * The function takes one list of pages to migrate and a function - * that determines from the page to be migrated and the private data - * the target of the move and allocates the page. + * @from: The list of pages to be migrated. + * @get_new_page: The function used to allocate free pages to be used + * as the target of the page migration. + * @private: Private data to be passed on to get_new_page() + * @mode: The migration mode that specifies the constraints for + * page migration, if any. + * @reason: The reason for page migration. * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * Caller should call putback_lru_pages to return pages to the LRU + * The function returns after 10 attempts or if no pages are movable any more + * because the list has become empty or no retryable pages exist any more. + * The caller should call putback_lru_pages() to return pages to the LRU * or free list only if ret != 0. * - * Return: Number of pages not migrated or error code. + * Returns the number of pages that were not migrated, or an error code. 
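A caller honouring the contract documented above looks roughly like the sketch below (the wrapper name is hypothetical; the pattern mirrors the call sites converted elsewhere in this patch):

static int migrate_isolated_list(struct list_head *pages,
				 new_page_t get_new_page, unsigned long private)
{
	int nr_failed;

	nr_failed = migrate_pages(pages, get_new_page, private,
				  MIGRATE_SYNC, MR_SYSCALL);
	if (nr_failed)
		/* Only put pages back on the LRU when the return is non-zero. */
		putback_lru_pages(pages);

	return nr_failed;
}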
*/ -int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode, int reason) +int migrate_pages(struct list_head *from, new_page_t get_new_page, + unsigned long private, enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1022,8 +1026,7 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining, - mode); + page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1056,15 +1059,13 @@ out: } int migrate_huge_page(struct page *hpage, new_page_t get_new_page, - unsigned long private, bool offlining, - enum migrate_mode mode) + unsigned long private, enum migrate_mode mode) { int pass, rc; for (pass = 0; pass < 10; pass++) { - rc = unmap_and_move_huge_page(get_new_page, - private, hpage, pass > 2, offlining, - mode); + rc = unmap_and_move_huge_page(get_new_page, private, + hpage, pass > 2, mode); switch (rc) { case -ENOMEM: goto out; @@ -1150,7 +1151,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto set_status; /* Use PageReserved to check for zero page */ - if (PageReserved(page) || PageKsm(page)) + if (PageReserved(page)) goto put_and_set; pp->page = page; @@ -1187,8 +1188,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, MIGRATE_SYNC, - MR_SYSCALL); + (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1312,7 +1312,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, err = -ENOENT; /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page) || PageKsm(page)) + if (!page || PageReserved(page)) goto set_status; err = page_to_nid(page); @@ -1459,7 +1459,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, * pages. Currently it only checks the watermarks which crude */ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, - int nr_migrate_pages) + unsigned long nr_migrate_pages) { int z; for (z = pgdat->nr_zones - 1; z >= 0; z--) { @@ -1495,7 +1495,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_xchg_last_nid(newpage, page_last_nid(page)); + page_nid_xchg_last(newpage, page_nid_last(page)); return newpage; } @@ -1555,39 +1555,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { - int ret = 0; + int page_lru; + + VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); /* Avoid migrating to a node that is nearly full */ - if (migrate_balanced_pgdat(pgdat, 1)) { - int page_lru; + if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) + return 0; - if (isolate_lru_page(page)) { - put_page(page); - return 0; - } + if (isolate_lru_page(page)) + return 0; - /* Page is isolated */ - ret = 1; - page_lru = page_is_file_cache(page); - if (!PageTransHuge(page)) - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - else - mod_zone_page_state(page_zone(page), - NR_ISOLATED_ANON + page_lru, - HPAGE_PMD_NR); + /* + * migrate_misplaced_transhuge_page() skips page migration's usual + * check on page_count(), so we must do it here, now that the page + * has been isolated: a GUP pin, or any other pin, prevents migration. + * The expected page count is 3: 1 for page's mapcount and 1 for the + * caller's pin and 1 for the reference taken by isolate_lru_page(). 
+ */ + if (PageTransHuge(page) && page_count(page) != 3) { + putback_lru_page(page); + return 0; } + page_lru = page_is_file_cache(page); + mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, + hpage_nr_pages(page)); + /* - * Page is either isolated or there is not enough space on the target - * node. If isolated, then it has taken a reference count and the - * callers reference can be safely dropped without the page - * disappearing underneath us during migration. Otherwise the page is - * not to be migrated but the callers reference should still be - * dropped so it does not leak. + * Isolating the page has taken another reference, so the + * caller's reference can be safely dropped without the page + * disappearing underneath us during migration. */ put_page(page); - - return ret; + return 1; } /* @@ -1598,7 +1599,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) int migrate_misplaced_page(struct page *page, int node) { pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; + int isolated; int nr_remaining; LIST_HEAD(migratepages); @@ -1606,42 +1607,43 @@ int migrate_misplaced_page(struct page *page, int node) * Don't migrate pages that are mapped in multiple processes. * TODO: Handle false sharing detection instead of this hammer */ - if (page_mapcount(page) != 1) { - put_page(page); + if (page_mapcount(page) != 1) goto out; - } /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - if (numamigrate_update_ratelimit(pgdat, 1)) { - put_page(page); + if (numamigrate_update_ratelimit(pgdat, 1)) goto out; - } isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) goto out; list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, - alloc_misplaced_dst_page, - node, false, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); + nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, + node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); if (nr_remaining) { putback_lru_pages(&migratepages); isolated = 0; } else count_vm_numa_event(NUMA_PAGE_MIGRATE); BUG_ON(!list_empty(&migratepages)); -out: return isolated; + +out: + put_page(page); + return 0; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +/* + * Migrates a THP to a given target node. page must be locked and is unlocked + * before returning. 
+ */ int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry, @@ -1672,17 +1674,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, new_page = alloc_pages_node(node, (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); - if (!new_page) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - goto out_dropref; - } - page_xchg_last_nid(new_page, page_last_nid(page)); + if (!new_page) + goto out_fail; + + page_nid_xchg_last(new_page, page_nid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); put_page(new_page); - goto out_keep_locked; + goto out_fail; } /* Prepare a page as a migration target */ @@ -1714,6 +1714,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, putback_lru_page(page); count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + isolated = 0; goto out; } @@ -1758,9 +1759,11 @@ out: -HPAGE_PMD_NR); return isolated; +out_fail: + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + unlock_page(page); put_page(page); -out_keep_locked: return 0; } #endif /* CONFIG_NUMA_BALANCING */ diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8cb1..da2be56a7b8f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) /* shmem/tmpfs may return swap: account for swapcache page too. */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif if (page) { @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { #ifdef CONFIG_SWAP pgoff = entry.val; - *vec = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(swap_address_space(entry), + pgoff); #else WARN_ON(1); *vec = 1; diff --git a/mm/mlock.c b/mm/mlock.c index f0b9ce572fc7..79b7cf7d1bca 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -102,13 +102,16 @@ void mlock_vma_page(struct page *page) * can't isolate the page, we leave it for putback_lru_page() and vmscan * [page_referenced()/try_to_unmap()] to deal with. */ -void munlock_vma_page(struct page *page) +unsigned int munlock_vma_page(struct page *page) { + unsigned int page_mask = 0; + BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, - -hpage_nr_pages(page)); + unsigned int nr_pages = hpage_nr_pages(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + page_mask = nr_pages - 1; if (!isolate_lru_page(page)) { int ret = SWAP_AGAIN; @@ -141,6 +144,8 @@ void munlock_vma_page(struct page *page) count_vm_event(UNEVICTABLE_PGMUNLOCKED); } } + + return page_mask; } /** @@ -155,13 +160,11 @@ void munlock_vma_page(struct page *page) * * vma->vm_mm->mmap_sem must be held for at least read. 
*/ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int *nonblocking) +long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) { struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start; - int nr_pages = (end - start) / PAGE_SIZE; + unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -186,7 +189,11 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE; - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, NULL, NULL, nonblocking); } @@ -202,56 +209,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -/** - * mlock_vma_pages_range() - mlock pages in specified vma range. - * @vma - the vma containing the specfied address range - * @start - starting address in @vma to mlock - * @end - end address [+1] in @vma to mlock - * - * For mmap()/mremap()/expansion of mlocked vma. - * - * return 0 on success for "normal" vmas. - * - * return number of pages [> 0] to be removed from locked_vm on success - * of "special" vmas. - */ -long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - int nr_pages = (end - start) / PAGE_SIZE; - BUG_ON(!(vma->vm_flags & VM_LOCKED)); - - /* - * filter unlockable vmas - */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - goto no_mlock; - - if (!((vma->vm_flags & VM_DONTEXPAND) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm))) { - - __mlock_vma_pages_range(vma, start, end, NULL); - - /* Hide errors from mmap() and other callers */ - return 0; - } - - /* - * User mapped kernel pages or huge pages: - * make these pages present to populate the ptes, but - * fall thru' to reset VM_LOCKED--no need to unlock, and - * return nr_pages so these don't get counted against task's - * locked limit. huge pages are already counted against - * locked vm limit. - */ - make_pages_present(start, end); - -no_mlock: - vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ - return nr_pages; /* error or pages NOT mlocked */ -} - /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -273,13 +230,12 @@ no_mlock: void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - unsigned long addr; - - lru_add_drain(); vma->vm_flags &= ~VM_LOCKED; - for (addr = start; addr < end; addr += PAGE_SIZE) { + while (start < end) { struct page *page; + unsigned int page_mask, page_increm; + /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -287,13 +243,22 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * suits munlock very well (and if somehow an abnormal page * has sneaked into the range, we won't oops here: great). 
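Both __get_user_pages() earlier in this patch and the munlock loop continued just below advance by page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask), which jumps from the current small page to the end of the enclosing huge page in one step. A standalone, userspace-only illustration of the arithmetic (values assumed: 4KB pages, 2MB THP):

#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PMD_NR	512		/* 2MB huge page / 4KB base pages */

int main(void)
{
	/* page_mask as reported for a THP: HPAGE_PMD_NR - 1 */
	unsigned long page_mask = HPAGE_PMD_NR - 1;
	/* an address 5 small pages into a huge page starting at pfn 1024 */
	unsigned long start = (1024UL + 5) << PAGE_SHIFT;
	unsigned long page_increm;

	page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
	printf("page_increm = %lu\n", page_increm);	/* 507 = 512 - 5 */
	return 0;
}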
*/ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, + &page_mask); if (page && !IS_ERR(page)) { lock_page(page); - munlock_vma_page(page); + lru_add_drain(); + /* + * Any THP page found by follow_page_mask() may have + * gotten split before reaching munlock_vma_page(), + * so we need to recompute the page_mask here. + */ + page_mask = munlock_vma_page(page); unlock_page(page); put_page(page); } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + start += page_increm * PAGE_SIZE; cond_resched(); } } @@ -303,7 +268,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and - * populate the ptes via make_pages_present(). + * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. */ @@ -391,9 +356,9 @@ static int do_mlock(unsigned long start, size_t len, int on) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vma->vm_flags | VM_LOCKED; - if (!on) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (on) + newflags |= VM_LOCKED; tmp = vma->vm_end; if (tmp > end) @@ -416,13 +381,20 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; - int ret = 0; + long ret = 0; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -498,7 +470,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); if (!error) - error = do_mlock_pages(start, len, 0); + error = __mm_populate(start, len, 0); return error; } @@ -517,20 +489,20 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) static int do_mlockall(int flags) { struct vm_area_struct * vma, * prev = NULL; - unsigned int def_flags = 0; if (flags & MCL_FUTURE) - def_flags = VM_LOCKED; - current->mm->def_flags = def_flags; + current->mm->def_flags |= VM_LOCKED; + else + current->mm->def_flags &= ~VM_LOCKED; if (flags == MCL_FUTURE) goto out; for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags | VM_LOCKED; - if (!(flags & MCL_CURRENT)) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (flags & MCL_CURRENT) + newflags |= VM_LOCKED; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); @@ -564,10 +536,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(¤t->mm->mmap_sem); - if (!ret && (flags & MCL_CURRENT)) { - /* Ignore errors */ - do_mlock_pages(0, TASK_SIZE, 1); - } + if (!ret && (flags & MCL_CURRENT)) + mm_populate(0, TASK_SIZE); out: return ret; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 1ffd97ae26d7..633c08863fd8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -9,6 +9,8 @@ #include <linux/init.h> #include <linux/kobject.h> #include <linux/export.h> 
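To make the new THP-aware stride in munlock_vma_pages_range() concrete: with page_mask returned by munlock_vma_page(), the loop jumps to the next huge-page boundary in one step instead of iterating per base page. A standalone illustration (PAGE_SHIFT and the address are example values, not taken from the patch):

        #include <stdio.h>

        #define PAGE_SHIFT 12

        int main(void)
        {
                unsigned long page_mask = 511;          /* 2 MiB THP: 512 pages - 1 */
                unsigned long start = 0x700000005000UL; /* 5 small pages into the THP */
                unsigned long page_increm;

                page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
                printf("advance by %lu pages\n", page_increm);  /* 507: next boundary */
                return 0;
        }

For an ordinary page the returned page_mask is 0, so the expression degenerates to the old one-page step.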
+#include <linux/memory.h> +#include <linux/notifier.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -69,34 +71,41 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Flags %d\n", + "Section %d Node %d Zone %d Lastnid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, + LAST_NID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d\n", + "Section %d Node %d Zone %d Lastnid %d\n", SECTIONS_SHIFT, NODES_SHIFT, - ZONES_SHIFT); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", - "Section %lu Node %lu Zone %lu\n", + ZONES_SHIFT, + LAST_NID_SHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", + "Section %lu Node %lu Zone %lu Lastnid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, - (unsigned long)ZONES_PGSHIFT); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", - "Zone ID: %lu -> %lu\n", - (unsigned long)ZONEID_PGOFF, - (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); + (unsigned long)ZONES_PGSHIFT, + (unsigned long)LAST_NID_PGSHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", + "Node/Zone ID: %lu -> %lu\n", + (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), + (unsigned long)ZONEID_PGOFF); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", - "location: %d -> %d unused %d -> %d flags %d -> %d\n", + "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); #ifdef NODE_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Last nid not in page flags"); +#endif if (SECTIONS_WIDTH) { shift -= SECTIONS_WIDTH; @@ -140,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel); struct kobject *mm_kobj; EXPORT_SYMBOL_GPL(mm_kobj); +#ifdef CONFIG_SMP +s32 vm_committed_as_batch = 32; + +static void __meminit mm_compute_batch(void) +{ + u64 memsized_batch; + s32 nr = num_present_cpus(); + s32 batch = max_t(s32, nr*2, 32); + + /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ + memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + + vm_committed_as_batch = max_t(s32, memsized_batch, batch); +} + +static int __meminit mm_compute_batch_notifier(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + mm_compute_batch(); + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block compute_batch_nb __meminitdata = { + .notifier_call = mm_compute_batch_notifier, + .priority = IPC_CALLBACK_PRI, /* use lowest priority */ +}; + +static int __init mm_compute_batch_init(void) +{ + mm_compute_batch(); + register_hotmemory_notifier(&compute_batch_nb); + + return 0; +} + +__initcall(mm_compute_batch_init); + +#endif + static int __init mm_sysfs_init(void) { mm_kobj = kobject_create_and_add("mm", kernel_kobj); diff --git a/mm/mmap.c b/mm/mmap.c index f54b235f29a9..fbad7b091090 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,6 +6,7 @@ * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ +#include <linux/kernel.h> #include <linux/slab.h> #include 
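Backing up to the mm_compute_batch() addition in mm/mm_init.c above: the per-cpu batch for vm_committed_as is sized to roughly 0.4% of memory per CPU, with a floor of max(2 * cpus, 32), and is recomputed on memory hotplug. A worked example with assumed machine sizes:

        #include <stdio.h>

        int main(void)
        {
                unsigned long totalram_pages = 2097152; /* 8 GiB of 4 KiB pages (example) */
                int nr = 4;                             /* present CPUs (example) */
                long batch = nr * 2 > 32 ? nr * 2 : 32;
                long memsized_batch = (totalram_pages / nr) / 256;      /* ~0.4% per CPU */

                if (memsized_batch > 0x7fffffff)
                        memsized_batch = 0x7fffffff;
                printf("vm_committed_as_batch = %ld\n",
                       memsized_batch > batch ? memsized_batch : batch); /* 2048 */
                return 0;
        }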
<linux/backing-dev.h> #include <linux/mm.h> @@ -32,6 +33,9 @@ #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/rbtree_augmented.h> +#include <linux/sched/sysctl.h> +#include <linux/notifier.h> +#include <linux/memory.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -83,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -121,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -143,7 +149,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free -= global_page_state(NR_SHMEM); - free += nr_swap_pages; + free += get_nr_swap_pages(); /* * Any slabs which are created with the @@ -162,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -176,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = (totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -202,7 +211,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file->f_path.dentry->d_inode->i_writecount); + atomic_inc(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) mapping->i_mmap_writable--; @@ -255,6 +264,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; unsigned long min_brk; + bool populate; down_write(&mm->mmap_sem); @@ -304,8 +314,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Ok, looks good - let it rip. 
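The __vm_enough_memory() change above replaces the old "leave 3% of this process for others" rule with min(3%, sysctl_user_reserve_kbytes), so a huge process can no longer inflate its own headroom without bound, and the root reserve becomes a tunable instead of a flat 3%. With the 128 MB default and example numbers:

        #include <stdio.h>

        #define PAGE_SHIFT 12

        int main(void)
        {
                unsigned long user_reserve_kbytes = 1UL << 17;  /* 128 MB default */
                unsigned long total_vm = 4UL << 20;             /* 16 GiB process, in pages */

                unsigned long reserve = user_reserve_kbytes >> (PAGE_SHIFT - 10);
                unsigned long old_rule = total_vm / 32;
                unsigned long new_rule = old_rule < reserve ? old_rule : reserve;

                printf("old headroom: %lu pages, new: %lu pages\n", old_rule, new_rule);
                /* 131072 pages (512 MB) before, capped at 32768 pages (128 MB) now */
                return 0;
        }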
*/ if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) goto out; + set_brk: mm->brk = brk; + populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; + up_write(&mm->mmap_sem); + if (populate) + mm_populate(oldbrk, newbrk - oldbrk); + return brk; + out: retval = mm->brk; up_write(&mm->mmap_sem); @@ -534,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, return 0; } +static unsigned long count_vma_pages_range(struct mm_struct *mm, + unsigned long addr, unsigned long end) +{ + unsigned long nr_pages = 0; + struct vm_area_struct *vma; + + /* Find first overlaping mapping */ + vma = find_vma_intersection(mm, addr, end); + if (!vma) + return 0; + + nr_pages = (min(end, vma->vm_end) - + max(addr, vma->vm_start)) >> PAGE_SHIFT; + + /* Iterate over the rest of the overlaps */ + for (vma = vma->vm_next; vma; vma = vma->vm_next) { + unsigned long overlap_len; + + if (vma->vm_start > end) + break; + + overlap_len = min(end, vma->vm_end) - vma->vm_start; + nr_pages += overlap_len >> PAGE_SHIFT; + } + + return nr_pages; +} + void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, struct rb_node **rb_link, struct rb_node *rb_parent) { @@ -567,7 +612,7 @@ static void __vma_link_file(struct vm_area_struct *vma) struct address_space *mapping = file->f_mapping; if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file->f_path.dentry->d_inode->i_writecount); + atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) mapping->i_mmap_writable++; @@ -800,7 +845,7 @@ again: remove_next = 1 + (end > next->vm_end); anon_vma_interval_tree_post_update_vma(vma); if (adjust_next) anon_vma_interval_tree_post_update_vma(next); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); @@ -820,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end); if (next->anon_vma) anon_vma_merge(vma, next); mm->map_count--; - mpol_put(vma_policy(next)); + vma_set_policy(vma, vma_policy(next)); kmem_cache_free(vm_area_cachep, next); /* * In mprotect's case 6 (see comments on vma_merge), @@ -910,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, if (is_mergeable_vma(vma, file, vm_flags) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) return 1; } @@ -1153,12 +1198,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint) unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff) + unsigned long flags, unsigned long pgoff, + unsigned long *populate) { struct mm_struct * mm = current->mm; struct inode *inode; vm_flags_t vm_flags; + *populate = 0; + /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -1217,7 +1265,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EAGAIN; } - inode = file ? file->f_path.dentry->d_inode : NULL; + inode = file ? file_inode(file) : NULL; if (file) { switch (flags & MAP_TYPE) { @@ -1279,7 +1327,26 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } } - return mmap_region(file, addr, len, flags, vm_flags, pgoff); + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. 
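The brk hunk above shows the division of labour this series introduces everywhere: the mapping is created under mmap_sem, and population happens only after the lock is dropped. In sketch form (schematic, mirroring the pattern visible in these hunks rather than a compilable unit), a caller of do_mmap_pgoff() now looks roughly like:

        unsigned long populate = 0;

        down_write(&mm->mmap_sem);
        addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff, &populate);
        up_write(&mm->mmap_sem);
        if (populate)
                mm_populate(addr, populate);    /* faults pages in, retaking mmap_sem as needed */

do_mmap_pgoff() reports how much to populate via the new out-parameter instead of faulting pages in itself while the write lock is held.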
+ */ + if (flags & MAP_NORESERVE) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } + + addr = mmap_region(file, addr, len, vm_flags, pgoff); + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) + *populate = len; + return addr; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, @@ -1291,20 +1358,30 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; file = fget(fd); if (!file) goto out; + if (is_file_hugepages(file)) + len = ALIGN(len, huge_page_size(hstate_file(file))); + retval = -EINVAL; + if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) + goto out_fput; } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; + struct hstate *hs; + + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); + if (!hs) + return -EINVAL; + + len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called * A dummy user value is used because we are not locking * memory so no accounting is necessary */ - file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); @@ -1315,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); +out_fput: if (file) fput(file); out: @@ -1394,8 +1472,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1403,7 +1480,24 @@ unsigned long mmap_region(struct file *file, unsigned long addr, int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; + struct inode *inode = file ? file_inode(file) : NULL; + + /* Check against address space limit. */ + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. + */ + if (!(vm_flags & MAP_FIXED)) + return -ENOMEM; + + nr_pages = count_vma_pages_range(mm, addr, addr + len); + + if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } /* Clear old maps */ error = -ENOMEM; @@ -1414,24 +1508,6 @@ munmap_back: goto munmap_back; } - /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) - return -ENOMEM; - - /* - * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. 
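The mmap_pgoff() hunk above decodes the huge page size from the high bits of the mmap flags (hstate_sizelog() of the encoded log2 size) and aligns len to it. From userspace the encoding looks like the following; the fallback #defines are the x86 values and are only an assumption for older headers, and the program needs hugepages reserved (e.g. via /proc/sys/vm/nr_hugepages) to succeed:

        #define _GNU_SOURCE
        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        #ifndef MAP_HUGETLB
        #define MAP_HUGETLB     0x40000
        #endif
        #ifndef MAP_HUGE_SHIFT
        #define MAP_HUGE_SHIFT  26
        #endif

        int main(void)
        {
                size_t len = 4UL << 20;         /* 4 MiB, a multiple of the huge page size */
                int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                            (21 << MAP_HUGE_SHIFT);     /* log2(2 MiB) = 21 */
                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, flags, -1, 0);

                if (p == MAP_FAILED) {
                        perror("mmap(MAP_HUGETLB)");
                        return 1;
                }
                memset(p, 0, len);
                munmap(p, len);
                return 0;
        }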
- */ - if ((flags & MAP_NORESERVE)) { - /* We honor MAP_NORESERVE if allowed to overcommit */ - if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - - /* hugetlb applies strict overcommit unless MAP_NORESERVE */ - if (file && is_file_hugepages(file)) - vm_flags |= VM_NORESERVE; - } - /* * Private writable mapping: check memory availability */ @@ -1530,10 +1606,12 @@ out: vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - if (!mlock_vma_pages_range(vma, addr, addr + len)) + if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); - } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) - make_pages_present(addr, addr + len); + else + vma->vm_flags &= ~VM_LOCKED; + } if (file) uprobe_mmap(vma); @@ -1800,15 +1878,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } #endif -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the lowest possible address? - */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) - mm->free_area_cache = addr; -} - /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): @@ -1865,19 +1934,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the highest possible address? - */ - if (addr > mm->free_area_cache) - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > mm->mmap_base) - mm->free_area_cache = mm->mmap_base; -} - unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -1917,12 +1973,9 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = NULL; - if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ - return NULL; - /* Check the cache first. */ /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; + vma = ACCESS_ONCE(mm->mmap_cache); if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { struct rb_node *rb_node; @@ -2169,9 +2222,28 @@ int expand_downwards(struct vm_area_struct *vma, return error; } +/* + * Note how expand_stack() refuses to expand the stack all the way to + * abut the next virtual mapping, *unless* that mapping itself is also + * a stack mapping. We want to leave room for a guard page, after all + * (the guard page itself is not added here, that is done by the + * actual page faulting logic) + * + * This matches the behavior of the guard page logic (see mm/memory.c: + * check_stack_guard_page()), which only allows the guard page to be + * removed under these circumstances. 
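On the find_vma() hunk above: reading mm->mmap_cache through ACCESS_ONCE() forces a single load, so a racing writer cannot make the validity checks and the returned pointer observe two different values. The macro is just a volatile cast; a standalone illustration of its shape (the variable names here are stand-ins):

        #include <stdio.h>

        /* same shape as the kernel's ACCESS_ONCE() */
        #define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

        static int *cache;      /* stands in for mm->mmap_cache */

        int main(void)
        {
                int v = 42;

                cache = &v;
                int *snap = ACCESS_ONCE(cache); /* one load; later uses go via snap */
                printf("%d\n", snap ? *snap : -1);
                return 0;
        }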
+ */ #ifdef CONFIG_STACK_GROWSUP int expand_stack(struct vm_area_struct *vma, unsigned long address) { + struct vm_area_struct *next; + + address &= PAGE_MASK; + next = vma->vm_next; + if (next && next->vm_start == address + PAGE_SIZE) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + } return expand_upwards(vma, address); } @@ -2186,14 +2258,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!prev || expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(prev, addr, prev->vm_end); - } + if (prev->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack(struct vm_area_struct *vma, unsigned long address) { + struct vm_area_struct *prev; + + address &= PAGE_MASK; + prev = vma->vm_prev; + if (prev && prev->vm_end == address) { + if (!(prev->vm_flags & VM_GROWSDOWN)) + return -ENOMEM; + } return expand_downwards(vma, address); } @@ -2214,9 +2293,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(vma, addr, start); - } + if (vma->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(vma, addr, start, NULL); return vma; } #endif @@ -2262,7 +2340,7 @@ static void unmap_region(struct mm_struct *mm, update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : 0); + next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, start, end); } @@ -2276,7 +2354,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; - unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; @@ -2293,11 +2370,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - if (mm->unmap_area == arch_unmap_area) - addr = prev ? prev->vm_end : mm->mmap_base; - else - addr = vma ? vma->vm_start : mm->mmap_base; - mm->unmap_area(mm, addr); mm->mmap_cache = NULL; /* Kill the cache. */ } @@ -2589,10 +2661,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) { - if (!mlock_vma_pages_range(vma, addr, addr + len)) - mm->locked_vm += (len >> PAGE_SHIFT); - } + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); return addr; } @@ -2600,10 +2670,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; unsigned long ret; + bool populate; down_write(&mm->mmap_sem); ret = do_brk(addr, len); + populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); + if (populate) + mm_populate(addr, len); return ret; } EXPORT_SYMBOL(vm_brk); @@ -2640,7 +2714,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); /* @@ -2886,7 +2960,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. 
*/ - down_write(&anon_vma->root->rwsem); + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares @@ -2943,7 +3017,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * vma in this mm is backed by the same anon_vma or address_space. * * We can take all the locks in random order because the VM code - * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never + * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never * takes more than one of them in a row. Secondly we're protected * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. * @@ -3001,7 +3075,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) BUG(); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); } } @@ -3052,3 +3126,115 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0); VM_BUG_ON(ret); } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) + +/* + * Reinititalise user and admin reserves if memory is added or removed. + * + * The default user reserve max is 128MB, and the default max for the + * admin reserve is 8MB. These are usually, but not always, enough to + * enable recovery from a memory hogging process using login/sshd, a shell, + * and tools like top. It may make sense to increase or even disable the + * reserve depending on the existence of swap or variations in the recovery + * tools. So, the admin may have changed them. + * + * If memory is added and the reserves have been eliminated or increased above + * the default max, then we'll trust the admin. + * + * If memory is removed and there isn't enough free memory, then we + * need to reset the reserves. + * + * Otherwise keep the reserve set by the admin. + */ +static int reserve_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long tmp, free_kbytes; + + switch (action) { + case MEM_ONLINE: + /* Default max is 128MB. Leave alone if modified by operator. */ + tmp = sysctl_user_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 17)) + init_user_reserve(); + + /* Default max is 8MB. Leave alone if modified by operator. 
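The two initialisers above both boil down to min(3% of currently free memory, a fixed cap), expressed in kilobytes. A worked example assuming 2 GiB free at the time the initcalls run:

        #include <stdio.h>

        #define PAGE_SHIFT 12

        int main(void)
        {
                unsigned long nr_free_pages = 524288;   /* 2 GiB free (example) */
                unsigned long free_kbytes = nr_free_pages << (PAGE_SHIFT - 10);

                unsigned long user  = free_kbytes / 32 < (1UL << 17) ?
                                      free_kbytes / 32 : (1UL << 17);
                unsigned long admin = free_kbytes / 32 < (1UL << 13) ?
                                      free_kbytes / 32 : (1UL << 13);

                printf("user_reserve_kbytes  = %lu\n", user);   /* 65536 (64 MB) */
                printf("admin_reserve_kbytes = %lu\n", admin);  /* 8192  (8 MB)  */
                return 0;
        }

The memory-hotplug notifier that follows only re-runs these calculations when the operator has left the defaults alone or when offlining has shrunk free memory below the configured reserve.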
*/ + tmp = sysctl_admin_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 13)) + init_admin_reserve(); + + break; + case MEM_OFFLINE: + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + if (sysctl_user_reserve_kbytes > free_kbytes) { + init_user_reserve(); + pr_info("vm.user_reserve_kbytes reset to %lu\n", + sysctl_user_reserve_kbytes); + } + + if (sysctl_admin_reserve_kbytes > free_kbytes) { + init_admin_reserve(); + pr_info("vm.admin_reserve_kbytes reset to %lu\n", + sysctl_admin_reserve_kbytes); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block reserve_mem_nb = { + .notifier_call = reserve_mem_notifier, +}; + +static int __meminit init_reserve_notifier(void) +{ + if (register_hotmemory_notifier(&reserve_mem_nb)) + printk("Failed registering memory add/remove notifier for admin reserve"); + + return 0; +} +module_init(init_reserve_notifier) diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4ed355..8a8cd0265e52 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -14,9 +14,6 @@ * use_mm * Makes the calling kernel thread take on the specified * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. * (Note: this routine is intended to be called only * from a kernel thread context) */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8a5ac8c686b0..93e6089cb456 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -37,7 +37,6 @@ static struct srcu_struct srcu; void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; - struct hlist_node *n; int id; /* @@ -45,13 +44,12 @@ void __mmu_notifier_release(struct mm_struct *mm) * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) /* - * if ->release runs before mmu_notifier_unregister it - * must be handled as it's the only way for the driver - * to flush all existing sptes and stop the driver - * from establishing any more sptes before all the - * pages in the mm are freed. + * If ->release runs before mmu_notifier_unregister it must be + * handled, as it's the only way for the driver to flush all + * existing sptes and stop the driver from establishing any more + * sptes before all the pages in the mm are freed. */ if (mn->ops->release) mn->ops->release(mn, mm); @@ -64,22 +62,22 @@ void __mmu_notifier_release(struct mm_struct *mm) hlist); /* * We arrived before mmu_notifier_unregister so - * mmu_notifier_unregister will do nothing other than - * to wait ->release to finish and - * mmu_notifier_unregister to return. + * mmu_notifier_unregister will do nothing other than to wait + * for ->release to finish and for mmu_notifier_unregister to + * return. */ hlist_del_init_rcu(&mn->hlist); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_srcu here prevents mmu_notifier_release to - * return to exit_mmap (which would proceed freeing all pages - * in the mm) until the ->release method returns, if it was - * invoked by mmu_notifier_unregister. + * synchronize_srcu here prevents mmu_notifier_release from returning to + * exit_mmap (which would proceed with freeing all pages in the mm) + * until the ->release method returns, if it was invoked by + * mmu_notifier_unregister. * - * The mmu_notifier_mm can't go away from under us because one - * mm_count is hold by exit_mmap. 
+ * The mmu_notifier_mm can't go away from under us because one mm_count + * is held by exit_mmap. */ synchronize_srcu(&srcu); } @@ -93,11 +91,10 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address) { struct mmu_notifier *mn; - struct hlist_node *n; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } @@ -110,11 +107,10 @@ int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { struct mmu_notifier *mn; - struct hlist_node *n; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->test_young) { young = mn->ops->test_young(mn, mm, address); if (young) @@ -130,11 +126,10 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { struct mmu_notifier *mn; - struct hlist_node *n; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); } @@ -145,11 +140,10 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { struct mmu_notifier *mn; - struct hlist_node *n; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } @@ -160,31 +154,31 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { struct mmu_notifier *mn; - struct hlist_node *n; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } srcu_read_unlock(&srcu, id); } +EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { struct mmu_notifier *mn; - struct hlist_node *n; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } srcu_read_unlock(&srcu, id); } +EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, @@ -296,29 +290,32 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) if (!hlist_unhashed(&mn->hlist)) { /* - * SRCU here will force exit_mmap to wait ->release to finish - * before freeing the pages. + * SRCU here will force exit_mmap to wait for ->release to + * finish before freeing the pages. */ int id; id = srcu_read_lock(&srcu); /* - * exit_mmap will block in mmu_notifier_release to - * guarantee ->release is called before freeing the - * pages. + * exit_mmap will block in mmu_notifier_release to guarantee + * that ->release is called before freeing the pages. 
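A side note on why every loop in mm/mmu_notifier.c loses its struct hlist_node *n: the hlist_for_each_entry_rcu() family was changed tree-wide in this same kernel release to derive the cursor from the entry pointer itself, so the extra variable becomes dead. Schematically (old form first, as still visible in the removed lines above; do_something() is a placeholder):

        /* before: caller-supplied node cursor */
        struct hlist_node *n;
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
                do_something(mn);

        /* after: the cursor is derived from 'mn', one argument fewer */
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
                do_something(mn);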
*/ if (mn->ops->release) mn->ops->release(mn, mm); srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); - hlist_del_rcu(&mn->hlist); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. + */ + hlist_del_init_rcu(&mn->hlist); spin_unlock(&mm->mmu_notifier_mm->lock); } /* - * Wait any running method to finish, of course including - * ->release if it was run by mmu_notifier_relase instead of us. + * Wait for any running method to finish, of course including + * ->release if it was run by mmu_notifier_release instead of us. */ synchronize_srcu(&srcu); diff --git a/mm/mmzone.c b/mm/mmzone.c index 4596d81b89b1..2ac0afbd68f3 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -1,7 +1,7 @@ /* * linux/mm/mmzone.c * - * management codes for pgdats and zones. + * management codes for pgdats, zones and page flags */ @@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); } + +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) +int page_nid_xchg_last(struct page *page, int nid) +{ + unsigned long old_flags, flags; + int last_nid; + + do { + old_flags = flags = page->flags; + last_nid = page_nid_last(page); + + flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); + flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + + return last_nid; +} +#endif diff --git a/mm/mremap.c b/mm/mremap.c index e1031e1f6a61..457d34ef3bf2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -19,6 +19,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mmu_notifier.h> +#include <linux/sched/sysctl.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -125,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte)); } arch_leave_lazy_mmu_mode(); @@ -134,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (anon_vma) - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } @@ -208,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr) + unsigned long new_len, unsigned long new_addr, bool *locked) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -299,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (vm_flags & VM_LOCKED) { mm->locked_vm += new_len >> PAGE_SHIFT; - if (new_len > old_len) - mlock_vma_pages_range(new_vma, new_addr + old_len, - new_addr + new_len); + *locked = true; } return new_addr; @@ -366,9 +365,8 @@ Eagain: return ERR_PTR(-EAGAIN); } -static unsigned long mremap_to(unsigned long addr, - unsigned long old_len, unsigned long new_addr, - unsigned long new_len) +static unsigned long mremap_to(unsigned long addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len, bool *locked) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -418,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr, if (ret & ~PAGE_MASK) goto out1; - ret = 
move_vma(vma, addr, old_len, new_len, new_addr); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); if (!(ret & ~PAGE_MASK)) goto out; out1: @@ -456,14 +454,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long charged = 0; - - down_write(¤t->mm->mmap_sem); + bool locked = false; if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) - goto out; + return ret; + + if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) + return ret; if (addr & ~PAGE_MASK) - goto out; + return ret; old_len = PAGE_ALIGN(old_len); new_len = PAGE_ALIGN(new_len); @@ -474,11 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * a zero new-len is nonsensical. */ if (!new_len) - goto out; + return ret; + + down_write(¤t->mm->mmap_sem); if (flags & MREMAP_FIXED) { - if (flags & MREMAP_MAYMOVE) - ret = mremap_to(addr, old_len, new_addr, new_len); + ret = mremap_to(addr, old_len, new_addr, new_len, + &locked); goto out; } @@ -520,8 +522,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; - mlock_vma_pages_range(vma, addr + old_len, - addr + new_len); + locked = true; + new_addr = addr; } ret = addr; goto out; @@ -547,11 +549,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - ret = move_vma(vma, addr, old_len, new_len, new_addr); + ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: if (ret & ~PAGE_MASK) vm_unacct_memory(charged); up_write(¤t->mm->mmap_sem); + if (locked && new_len > old_len) + mm_populate(new_addr + old_len, new_len - old_len); return ret; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index b8294fc03df8..61107cf55bb3 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -45,9 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (!addr) return NULL; + memblock_reserve(addr, size); ptr = phys_to_virt(addr); memset(ptr, 0, size); - memblock_reserve(addr, size); /* * The min_count is set to 0 so that bootmem allocated blocks * are never reported as leaks. @@ -120,7 +120,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return end_pfn - start_pfn; } -unsigned long __init free_low_memory_core_early(int nodeid) +static unsigned long __init free_low_memory_core_early(void) { unsigned long count = 0; phys_addr_t start, end, size; @@ -137,35 +137,25 @@ unsigned long __init free_low_memory_core_early(int nodeid) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. 
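Looping back to page_nid_xchg_last() in the mm/mmzone.c hunk above: it is a classic read-modify-cmpxchg retry on page->flags. The same pattern in standalone form (the field layout constants are made up for illustration, and GCC's __sync builtin stands in for the kernel's cmpxchg()):

        #include <stdio.h>

        #define LAST_NID_PGSHIFT 8              /* illustrative layout */
        #define LAST_NID_MASK    0xffUL

        static int xchg_last_nid(unsigned long *flags, int nid)
        {
                unsigned long old, new;
                int last;

                do {
                        old = *flags;
                        last = (old >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
                        new = old & ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
                        new |= ((unsigned long)nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
                } while (__sync_val_compare_and_swap(flags, old, new) != old);

                return last;
        }

        int main(void)
        {
                unsigned long flags = 3UL << LAST_NID_PGSHIFT;

                printf("previous nid: %d\n", xchg_last_nid(&flags, 5)); /* 3 */
                printf("flags now:    %#lx\n", flags);                  /* 0x500 */
                return 0;
        }

The loop only retries if another CPU changed page->flags between the read and the cmpxchg, which keeps the update lock-free.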
- */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +void __init reset_all_zones_managed_pages(void) { - register_page_bootmem_info_node(pgdat); - reset_node_lowmem_managed_pages(pgdat); + struct pglist_data *pgdat; - /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -175,17 +165,19 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { - struct pglist_data *pgdat; + unsigned long pages; - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); /* * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ - return free_low_memory_core_early(MAX_NUMNODES); + pages = free_low_memory_core_early(); + totalram_pages += pages; + + return pages; } /** @@ -406,6 +398,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from diff --git a/mm/nommu.c b/mm/nommu.c index 79c3cac87afa..ecd1f158548e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -29,6 +29,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> +#include <linux/sched/sysctl.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -55,13 +56,14 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; -unsigned long num_physpages; unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -82,7 +84,6 @@ unsigned long vm_memory_committed(void) EXPORT_SYMBOL_GPL(vm_memory_committed); EXPORT_SYMBOL(mem_map); -EXPORT_SYMBOL(num_physpages); /* list of mapped, potentially shareable regions */ static struct kmem_cache *vm_region_jar; @@ -139,10 +140,10 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *retry) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int foll_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -189,9 +190,10 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page 
**pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { int flags = 0; @@ -226,8 +228,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; +LIST_HEAD(vmap_area_list); void vfree(const void *addr) { @@ -279,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn); long vread(char *buf, char *addr, unsigned long count) { + /* Don't allow overflow */ + if ((unsigned long) buf + count < count) + count = -(unsigned long) buf; + memcpy(buf, addr, count); return count; } @@ -819,7 +824,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = mm->mmap_cache; + vma = ACCESS_ONCE(mm->mmap_cache); if (vma && vma->vm_start <= addr && vma->vm_end > addr) return vma; @@ -941,7 +946,7 @@ static int validate_mmap_request(struct file *file, */ mapping = file->f_mapping; if (!mapping) - mapping = file->f_path.dentry->d_inode->i_mapping; + mapping = file_inode(file)->i_mapping; capabilities = 0; if (mapping && mapping->backing_dev_info) @@ -950,7 +955,7 @@ static int validate_mmap_request(struct file *file, if (!capabilities) { /* no explicit capabilities set, so assume some * defaults */ - switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { + switch (file_inode(file)->i_mode & S_IFMT) { case S_IFREG: case S_IFBLK: capabilities = BDI_CAP_MAP_COPY; @@ -985,11 +990,11 @@ static int validate_mmap_request(struct file *file, !(file->f_mode & FMODE_WRITE)) return -EACCES; - if (IS_APPEND(file->f_path.dentry->d_inode) && + if (IS_APPEND(file_inode(file)) && (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file->f_path.dentry->d_inode)) + if (locks_verify_locked(file_inode(file))) return -EAGAIN; if (!(capabilities & BDI_CAP_MAP_DIRECT)) @@ -1249,7 +1254,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff) + unsigned long pgoff, + unsigned long *populate) { struct vm_area_struct *vma; struct vm_region *region; @@ -1259,6 +1265,8 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + *populate = 0; + /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1322,8 +1330,8 @@ unsigned long do_mmap_pgoff(struct file *file, continue; /* search for overlapping mappings on the same file */ - if (pregion->vm_file->f_path.dentry->d_inode != - file->f_path.dentry->d_inode) + if (file_inode(pregion->vm_file) != + file_inode(file)) continue; if (pregion->vm_pgoff >= pgend) @@ -1765,7 +1773,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) * * MREMAP_FIXED is not supported under NOMMU conditions */ -unsigned long do_mremap(unsigned long addr, +static unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { @@ -1800,7 +1808,6 @@ unsigned long do_mremap(unsigned long addr, vma->vm_end = vma->vm_start + new_len; return vma->vm_start; } -EXPORT_SYMBOL(do_mremap); SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, @@ -1814,9 +1821,11 @@ SYSCALL_DEFINE5(mremap, unsigned 
long, addr, unsigned long, old_len, return ret; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) { + *page_mask = 0; return NULL; } @@ -1831,6 +1840,16 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long pfn = start >> PAGE_SHIFT; + unsigned long vm_len = vma->vm_end - vma->vm_start; + + pfn += vma->vm_pgoff; + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { @@ -1852,10 +1871,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) @@ -1881,7 +1896,7 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -1903,7 +1918,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free -= global_page_state(NR_SHMEM); - free += nr_swap_pages; + free += get_nr_swap_pages(); /* * Any slabs which are created with the @@ -1922,10 +1937,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -1935,16 +1950,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = totalram_pages * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some 3% for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -2106,3 +2124,45 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, up_write(&nommu_region_sem); return 0; } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. 
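One small but easy-to-miss fix a bit further up is the overflow clamp added to the nommu vread(): if buf + count would wrap past the top of the address space, count is trimmed to the bytes that remain. The arithmetic, standalone (64-bit example addresses):

        #include <stdio.h>

        int main(void)
        {
                unsigned long buf = 0xffffffffffff0000UL;       /* example address */
                unsigned long count = 0x20000;                  /* would wrap */

                if (buf + count < count)        /* unsigned wrap-around test */
                        count = -buf;           /* bytes left to the end of the space */

                printf("clamped count: %#lx\n", count);         /* 0x10000 */
                return 0;
        }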
+ * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0399f146ae49..79e451a78c9e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(memcg, p); - show_mem(SHOW_MEM_FILTER_NODES); + if (memcg) + mem_cgroup_print_oom_info(memcg, p); + else + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(memcg, nodemask); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0713bfbf0954..4514ad7415c3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -35,6 +35,7 @@ #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> #include <linux/timer.h> +#include <linux/sched/rt.h> #include <trace/events/writeback.h> /* @@ -240,6 +241,9 @@ static unsigned long global_dirtyable_memory(void) if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); + /* Subtract min_free_kbytes */ + x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); + return x + 1; /* Ensure that we never return 0 */ } @@ -692,7 +696,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, * => fast response on large errors; small oscillation near setpoint */ setpoint = (freerun + limit) / 2; - x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, + x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; @@ -1982,6 +1986,8 @@ int __set_page_dirty_no_writeback(struct page *page) */ void account_page_dirtied(struct page *page, struct address_space *mapping) { + trace_writeback_dirty_page(page, mapping); + if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); @@ -2289,3 +2295,23 @@ int mapping_tagged(struct address_space *mapping, int tag) return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); + +/** + * wait_for_stable_page() - wait for writeback to finish, if necessary. + * @page: The page to wait on. + * + * This function determines if the given page is related to a backing device + * that requires page contents to be held stable during writeback. If so, then + * it will wait for any pending writeback to complete. 
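On the bdi_position_ratio() hunk above: setpoint and dirty are unsigned longs, so on 32-bit the old (setpoint - dirty) wrapped to a large positive value before div_s64() ever saw it; casting to s64 first preserves a genuinely negative difference when dirty has overshot the setpoint. Modelling a 32-bit unsigned long with uint32_t:

        #include <stdio.h>
        #include <stdint.h>
        #include <inttypes.h>

        #define RATELIMIT_CALC_SHIFT 10

        int main(void)
        {
                uint32_t setpoint = 1000, dirty = 1500; /* dirty has overshot */

                /* old: wraps in 32-bit unsigned arithmetic, then reaches the
                 * 64-bit divide as a huge positive dividend */
                int64_t old_arg = (uint32_t)((setpoint - dirty) << RATELIMIT_CALC_SHIFT);

                /* new: subtract as signed 64-bit first */
                int64_t new_arg = ((int64_t)setpoint - (int64_t)dirty)
                                        << RATELIMIT_CALC_SHIFT;

                printf("old: %" PRId64 "  new: %" PRId64 "\n", old_arg, new_arg);
                return 0;
        }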
+ */ +void wait_for_stable_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (!bdi_cap_stable_pages_required(bdi)) + return; + + wait_on_page_writeback(page); +} +EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ba5e37127fc..b100255dedda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -58,11 +58,17 @@ #include <linux/prefetch.h> #include <linux/migrate.h> #include <linux/page-debug-flags.h> +#include <linux/hugetlb.h> +#include <linux/sched/rt.h> +#include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -98,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); +/* Protect totalram_pages and zone->managed_pages */ +static DEFINE_SPINLOCK(managed_page_count_lock); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; /* @@ -195,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; +int user_min_free_kbytes; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -221,11 +231,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -/* - * NOTE: - * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. - * Instead, use {un}set_pageblock_isolate. - */ void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -244,15 +249,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) int ret = 0; unsigned seq; unsigned long pfn = page_to_pfn(page); + unsigned long sp, start_pfn; do { seq = zone_span_seqbegin(zone); - if (pfn >= zone->zone_start_pfn + zone->spanned_pages) - ret = 1; - else if (pfn < zone->zone_start_pfn) + start_pfn = zone->zone_start_pfn; + sp = zone->spanned_pages; + if (!zone_spans_pfn(zone, pfn)) ret = 1; } while (zone_span_seqretry(zone, seq)); + if (ret) + pr_err("page %lu outside zone [ %lu - %lu ]\n", + pfn, start_pfn, start_pfn + sp); + return ret; } @@ -292,7 +302,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ return; } @@ -324,8 +334,8 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - reset_page_mapcount(page); /* remove PageBuddy */ - add_taint(TAINT_BAD_PAGE); + page_mapcount_reset(page); /* remove PageBuddy */ + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* @@ -537,6 +547,8 @@ static inline void __free_one_page(struct page *page, unsigned long uninitialized_var(buddy_idx); struct page *buddy; + VM_BUG_ON(!zone_is_initialized(zone)); + if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) return; @@ -610,7 +622,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - reset_page_last_nid(page); + page_nid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -670,7 +682,7 @@ static void free_pcppages_bulk(struct zone *zone, 
int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { + if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); if (is_migrate_cma(mt)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); @@ -688,7 +700,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); - if (unlikely(migratetype != MIGRATE_ISOLATE)) + if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -735,14 +747,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * Read access to zone->managed_pages is safe because it's unsigned long, - * but we still need to serialize writers. Currently all callers of - * __free_pages_bootmem() except put_page_bootmem() should only be used - * at boot time. So for shorter boot time, we shift the burden to - * put_page_bootmem() to serialize writers. - */ -void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; unsigned int loop; @@ -777,7 +782,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); __free_pages(page, pageblock_order); - totalram_pages += pageblock_nr_pages; + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -916,7 +921,9 @@ static int fallbacks[MIGRATE_TYPES][4] = { [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, #endif [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ +#ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ +#endif }; /* @@ -981,9 +988,9 @@ int move_freepages_block(struct zone *zone, struct page *page, end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ - if (start_pfn < zone->zone_start_pfn) + if (!zone_spans_pfn(zone, start_pfn)) start_page = page; - if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) + if (!zone_spans_pfn(zone, end_pfn)) return 0; return move_freepages(zone, start_page, end_page, migratetype); @@ -1040,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * MIGRATE_CMA areas. 
*/ if (!is_migrate_cma(migratetype) && - (unlikely(current_order >= pageblock_order / 2) || + (current_order >= pageblock_order / 2 || start_migratetype == MIGRATE_RECLAIMABLE || page_group_by_mobility_disabled)) { int pages; @@ -1142,7 +1149,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add_tail(&page->lru, list); if (IS_ENABLED(CONFIG_CMA)) { mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) + if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) mt = migratetype; } set_freepage_migratetype(page, mt); @@ -1169,10 +1176,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; if (to_drain > 0) { @@ -1277,7 +1286,7 @@ void mark_free_pages(struct zone *zone) spin_lock_irqsave(&zone->lock, flags); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1326,7 +1335,7 @@ void free_hot_cold_page(struct page *page, int cold) * excessively into the page allocator */ if (migratetype >= MIGRATE_PCPTYPES) { - if (unlikely(migratetype == MIGRATE_ISOLATE)) { + if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(zone, page, 0, migratetype); goto out; } @@ -1340,8 +1349,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: @@ -1388,15 +1398,10 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } +EXPORT_SYMBOL_GPL(split_page); -/* - * Similar to the split_page family of functions except that the page - * required at the given order and being isolated now to prevent races - * with parallel allocators - */ -int capture_free_page(struct page *page, int alloc_order, int migratetype) +static int __isolate_free_page(struct page *page, unsigned int order) { - unsigned int order; unsigned long watermark; struct zone *zone; int mt; @@ -1404,16 +1409,15 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) BUG_ON(!PageBuddy(page)); zone = page_zone(page); - order = page_order(page); mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_ISOLATE) { + if (!is_migrate_isolate(mt)) { /* Obey watermarks as if the page was being allocated */ watermark = low_wmark_pages(zone) + (1 << order); if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + __mod_zone_freepage_state(zone, -(1UL << order), mt); } /* Remove page from free list */ @@ -1421,22 +1425,18 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone->free_area[order].nr_free--; rmv_page_order(page); - if (alloc_order != order) - expand(zone, page, alloc_order, order, - &zone->free_area[order], migratetype); - - /* Set the pageblock if the captured page is at least a pageblock */ + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { 
struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } } - return 1UL << alloc_order; + return 1UL << order; } /* @@ -1454,10 +1454,9 @@ int split_free_page(struct page *page) unsigned int order; int nr_pages; - BUG_ON(!PageBuddy(page)); order = page_order(page); - nr_pages = capture_free_page(page, order, 0); + nr_pages = __isolate_free_page(page, order); if (!nr_pages) return 0; @@ -1629,6 +1628,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; + long free_cma = 0; free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) @@ -1638,9 +1638,10 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages <= min + lowmem_reserve) + + if (free_pages - free_cma <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1655,20 +1656,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -#ifdef CONFIG_MEMORY_ISOLATION -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - if (unlikely(zone->nr_pageblock_isolate)) - return zone->nr_pageblock_isolate * pageblock_nr_pages; - return 0; -} -#else -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - return 0; -} -#endif - bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { @@ -1684,14 +1671,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - /* - * If the zone has MIGRATE_ISOLATE type free pages, we should consider - * it. nr_zone_isolate_freepages is never accurate so kswapd might not - * sleep although it could do so. But this is more desirable for memory - * hotplug than sleeping which can cause a livelock in the direct - * reclaim path. - */ - free_pages -= nr_zone_isolate_freepages(z); return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages); } @@ -1965,9 +1944,24 @@ zonelist_scan: continue; default: /* did we reclaim enough */ - if (!zone_watermark_ok(zone, order, mark, + if (zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) + goto try_this_zone; + + /* + * Failed to reclaim enough to meet watermark. + * Only mark the zone full if checking the min + * watermark or if we failed to reclaim just + * 1<<order pages or else the page allocator + * fastpath will prematurely mark zones full + * when the watermark is between the low and + * min watermarks. + */ + if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || + ret == ZONE_RECLAIM_SOME) goto this_zone_full; + + continue; } } @@ -2027,6 +2021,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) return; /* + * Walking all memory to count page types is very expensive and should + * be inhibited in non-blockable contexts. 
+ */ + if (!(gfp_mask & __GFP_WAIT)) + filter |= SHOW_MEM_FILTER_PAGE_COUNT; + + /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set * of allowed nodes. @@ -2163,8 +2164,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page = NULL; - if (!order) return NULL; @@ -2176,16 +2175,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction, &page); + contended_compaction); current->flags &= ~PF_MEMALLOC; - /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); - goto got_page; - } - if (*did_some_progress != COMPACT_SKIPPED) { + struct page *page; + /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2195,7 +2190,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { -got_page: preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; @@ -2656,10 +2650,17 @@ retry_cpuset: page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); - if (unlikely(!page)) + if (unlikely(!page)) { + /* + * Runtime PM, block IO and its error handling path + * can deadlock because I/O on the device might not + * complete. + */ + gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + } trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -2831,18 +2832,27 @@ void free_pages_exact(void *virt, size_t size) } EXPORT_SYMBOL(free_pages_exact); -static unsigned int nr_free_zone_pages(int offset) +/** + * nr_free_zone_pages - count number of pages beyond high watermark + * @offset: The zone index of the highest zone + * + * nr_free_zone_pages() counts the number of counts pages which are beyond the + * high watermark within all zones at or below a given zone index. For each + * zone, the number of pages is calculated as: + * managed_pages - high_pages + */ +static unsigned long nr_free_zone_pages(int offset) { struct zoneref *z; struct zone *zone; /* Just pick one node, since fallback list is circular */ - unsigned int sum = 0; + unsigned long sum = 0; struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->present_pages; + unsigned long size = zone->managed_pages; unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -2851,19 +2861,25 @@ static unsigned int nr_free_zone_pages(int offset) return sum; } -/* - * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL +/** + * nr_free_buffer_pages - count number of pages beyond high watermark + * + * nr_free_buffer_pages() counts the number of pages which are beyond the high + * watermark within ZONE_DMA and ZONE_NORMAL. 
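The nr_free_zone_pages() rewrite above counts, per zone, managed_pages minus the high watermark. A small userspace sketch of the same summation, with made-up zone sizes, makes the accounting concrete (struct zone_sketch is an invented stand-in, not a kernel type).

#include <stdio.h>

/* Invented stand-in holding only the two fields the sum needs. */
struct zone_sketch {
	unsigned long managed_pages;
	unsigned long high_wmark;
};

/* Same arithmetic as nr_free_zone_pages(): pages beyond the high watermark. */
static unsigned long nr_free_zone_pages_sketch(const struct zone_sketch *z, int n)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < n; i++)
		if (z[i].managed_pages > z[i].high_wmark)
			sum += z[i].managed_pages - z[i].high_wmark;
	return sum;
}

int main(void)
{
	struct zone_sketch zones[] = {
		{ .managed_pages = 4096,   .high_wmark = 128 },  /* DMA-like    */
		{ .managed_pages = 262144, .high_wmark = 2048 }, /* NORMAL-like */
	};

	/* Prints 264064: (4096 - 128) + (262144 - 2048). */
	printf("%lu pages beyond the high watermarks\n",
	       nr_free_zone_pages_sketch(zones, 2));
	return 0;
}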
*/ -unsigned int nr_free_buffer_pages(void) +unsigned long nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); -/* - * Amount of free RAM allocatable within all zones +/** + * nr_free_pagecache_pages - count number of pages beyond high watermark + * + * nr_free_pagecache_pages() counts the number of pages which are beyond the + * high watermark within all zones. */ -unsigned int nr_free_pagecache_pages(void) +unsigned long nr_free_pagecache_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); } @@ -2890,12 +2906,16 @@ EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_present_pages; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += pgdat->node_zones[zone_type].managed_pages; + val->totalram = managed_pages; val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], NR_FREE_PAGES); #else @@ -2938,7 +2958,9 @@ static void show_migration_types(unsigned char type) #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif +#ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = 'I', +#endif }; char tmp[MIGRATE_TYPES + 1]; char *p = tmp; @@ -3113,6 +3135,8 @@ void show_free_areas(unsigned int filter) printk("= %lukB\n", K(total)); } + hugetlb_show_meminfo(); + printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); show_swap_cache_info(); @@ -3130,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * Add all populated zones of a node to the zonelist. */ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones, enum zone_type zone_type) + int nr_zones) { struct zone *zone; - - BUG_ON(zone_type >= MAX_NR_ZONES); - zone_type++; + enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; @@ -3145,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } - } while (zone_type); + return nr_zones; } @@ -3230,18 +3252,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write, static DEFINE_MUTEX(zl_order_mutex); mutex_lock(&zl_order_mutex); - if (write) - strcpy(saved_string, (char*)table->data); + if (write) { + if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { + ret = -EINVAL; + goto out; + } + strcpy(saved_string, (char *)table->data); + } ret = proc_dostring(table, write, buffer, length, ppos); if (ret) goto out; if (write) { int oldval = user_zonelist_order; - if (__parse_numa_zonelist_order((char*)table->data)) { + + ret = __parse_numa_zonelist_order((char *)table->data); + if (ret) { /* * bogus value. 
restore saved string */ - strncpy((char*)table->data, saved_string, + strncpy((char *)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { @@ -3277,7 +3306,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; - int best_node = -1; + int best_node = NUMA_NO_NODE; const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ @@ -3333,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3348,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3405,8 +3433,8 @@ static int default_zonelist_order(void) z = &NODE_DATA(nid)->node_zones[zone_type]; if (populated_zone(z)) { if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; + low_kmem_size += z->managed_pages; + total_size += z->managed_pages; } else if (zone_type == ZONE_NORMAL) { /* * If any node has only lowmem, then node order @@ -3556,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones @@ -3569,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; @@ -3685,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif + /* we have to stop all cpus to guarantee there is no user + of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } @@ -3821,7 +3847,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) * the block. 
*/ start_pfn = zone->zone_start_pfn; - end_pfn = start_pfn + zone->spanned_pages; + end_pfn = zone_end_pfn(zone); start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; @@ -3917,8 +3943,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); - reset_page_mapcount(page); - reset_page_last_nid(page); + page_mapcount_reset(page); + page_nid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -3935,7 +3961,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * pfn out of zone. */ if ((z->zone_start_pfn <= pfn) - && (pfn < z->zone_start_pfn + z->spanned_pages) + && (pfn < zone_end_pfn(z)) && !(pfn & (pageblock_nr_pages - 1))) set_pageblock_migratetype(page, MIGRATE_MOVABLE); @@ -3973,7 +3999,7 @@ static int __meminit zone_batchsize(struct zone *zone) * * OK, so we don't know how big the cache is. So guess. */ - batch = zone->present_pages / 1024; + batch = zone->managed_pages / 1024; if (batch * PAGE_SIZE > 512 * 1024) batch = (512 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ @@ -4012,7 +4038,40 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher then ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (acording the the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + +/* a companion to pageset_set_high() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); +} + +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -4021,45 +4080,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. 
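pageset_update() above drops ->batch to a fail-safe 1 before publishing the new ->high, so a reader can never pair an old, larger batch with a smaller new high. The other half of the contract is that readers sample ->batch exactly once; condensed for illustration from the drain_zone_pages() and free_hot_cold_page() hunks elsewhere in this diff, the reader side looks like this:

/*
 * Reader side, condensed from hunks in this diff: snapshot ->batch once
 * so the bulk free below never uses a value that changes between the
 * check and the free.  Because the updater publishes batch = 1 first,
 * the worst a racing reader can observe is an overly small batch, never
 * one that exceeds the new high.
 */
unsigned long batch = ACCESS_ONCE(pcp->batch);

if (pcp->count >= batch) {
	free_pcppages_bulk(zone, batch, pcp);
	pcp->count -= batch;
}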
*/ - -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); } -static void __meminit setup_zone_pageset(struct zone *zone) +static void __meminit pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { - int cpu; - - zone->pageset = alloc_percpu(struct per_cpu_pageset); + if (percpu_pagelist_fraction) + pageset_set_high(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); +} - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - setup_pageset(pcp, zone_batchsize(zone)); + pageset_init(pcp); + pageset_set_high_and_batch(zone, pcp); +} - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->present_pages / - percpu_pagelist_fraction)); - } +static void __meminit setup_zone_pageset(struct zone *zone) +{ + int cpu; + zone->pageset = alloc_percpu(struct per_cpu_pageset); + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* @@ -4169,10 +4238,23 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) { unsigned long start_pfn, end_pfn; int i, nid; + /* + * NOTE: The following SMP-unsafe globals are only used early in boot + * when the kernel is running single-threaded. + */ + static unsigned long __meminitdata last_start_pfn, last_end_pfn; + static int __meminitdata last_nid; + + if (last_start_pfn <= pfn && pfn < last_end_pfn) + return last_nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - if (start_pfn <= pfn && pfn < end_pfn) + if (start_pfn <= pfn && pfn < end_pfn) { + last_start_pfn = start_pfn; + last_end_pfn = end_pfn; + last_nid = nid; return nid; + } /* This is a memory hole */ return -1; } @@ -4335,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - /* Get the start and end of the node and zone */ - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, @@ -4396,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 
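The __early_pfn_to_nid() hunk above memoizes the last matching pfn range, which is safe only because early boot is single-threaded (as its NOTE says). A standalone sketch of the same caching idea, with a made-up two-node memory map, is below.

#include <stdio.h>

struct pfn_range { unsigned long start, end; int nid; };

/* Made-up memory map standing in for the memblock ranges. */
static const struct pfn_range ranges[] = {
	{ 0x00000, 0x80000,  0 },
	{ 0x80000, 0x100000, 1 },
};

/* Same caching idea as __early_pfn_to_nid(): remember the last hit so
 * repeated lookups of nearby pfns skip the linear search. */
static int early_pfn_to_nid_sketch(unsigned long pfn)
{
	static unsigned long last_start, last_end;
	static int last_nid = -1;
	unsigned int i;

	if (last_start <= pfn && pfn < last_end)
		return last_nid;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		if (ranges[i].start <= pfn && pfn < ranges[i].end) {
			last_start = ranges[i].start;
			last_end = ranges[i].end;
			last_nid = ranges[i].nid;
			return last_nid;
		}
	}
	return -1;	/* memory hole */
}

int main(void)
{
	printf("pfn 0x1000 -> node %d\n", early_pfn_to_nid_sketch(0x1000));
	printf("pfn 0x2000 -> node %d (served from the cached range)\n",
	       early_pfn_to_nid_sketch(0x2000));
	return 0;
}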
zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4416,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; @@ -4423,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) @@ -4434,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - zones_size); + node_start_pfn, + node_end_pfn, + zones_size); pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, - zholes_size); + node_start_pfn, node_end_pfn, + zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4462,10 +4554,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, * round what is now in bits to nearest long in bits, then return it in * bytes. */ -static unsigned long __init usemap_size(unsigned long zonesize) +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize; + zonesize += zone_start_pfn & (pageblock_nr_pages-1); usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; @@ -4475,17 +4568,19 @@ static unsigned long __init usemap_size(unsigned long zonesize) } static void __init setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) + struct zone *zone, + unsigned long zone_start_pfn, + unsigned long zonesize) { - unsigned long usemapsize = usemap_size(zonesize); + unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, usemapsize); } #else -static inline void setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) {} +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, + unsigned long zone_start_pfn, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -4554,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. 
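The usemap_size()/setup_usemap() change above matters for zones that do not start on a pageblock boundary: the leading partial pageblock now gets flag bits too. A standalone sketch with assumed values (4 KiB pages, pageblock_order = 9, 4 flag bits per pageblock, 64-bit longs; common-config assumptions, not facts from this diff) shows the effect.

#include <stdio.h>

#define PAGEBLOCK_ORDER		9UL			/* assumed */
#define PAGEBLOCK_NR_PAGES	(1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS	4UL			/* assumed */
#define BITS_PER_LONG_SK	(8 * sizeof(unsigned long))

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

/* Mirrors the fixed usemap_size(): the unaligned zone start counts
 * toward the number of pageblocks needing flag bits. */
static unsigned long usemap_size_sketch(unsigned long zone_start_pfn,
					unsigned long zonesize)
{
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;
	usemapsize *= NR_PAGEBLOCK_BITS;
	usemapsize = roundup_ul(usemapsize, BITS_PER_LONG_SK);
	return usemapsize / 8;	/* bytes */
}

int main(void)
{
	/* A 1000-page zone starting 256 pages into a pageblock spans
	 * three pageblocks, not two, so its usemap must cover three. */
	printf("usemap bytes: %lu\n", usemap_size_sketch(256, 1000));
	return 0;
}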
*/ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -4575,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, zones_size); + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, zholes_size); /* @@ -4611,7 +4710,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, nr_all_pages += freesize; zone->spanned_pages = size; - zone->present_pages = freesize; + zone->present_pages = realsize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. @@ -4636,7 +4735,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, continue; set_pageblock_order(); - setup_usemap(pgdat, zone, size); + setup_usemap(pgdat, zone, zone_start_pfn, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); @@ -4663,7 +4762,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * for the buddy allocator to function correctly. */ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); - end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); @@ -4690,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); @@ -4697,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; init_zone_allows_reclaim(nid); - calculate_node_totalpages(pgdat, zones_size, zholes_size); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4706,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -4715,7 +4821,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* * Figure out the number of possible node ids. 
*/ -static void __init setup_nr_node_ids(void) +void __init setup_nr_node_ids(void) { unsigned int node; unsigned int highest = 0; @@ -4724,10 +4830,6 @@ static void __init setup_nr_node_ids(void) highest = node; nr_node_ids = highest + 1; } -#else -static inline void setup_nr_node_ids(void) -{ -} #endif /** @@ -5118,6 +5220,101 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +void adjust_managed_page_count(struct page *page, long count) +{ + spin_lock(&managed_page_count_lock); + page_zone(page)->managed_pages += count; + totalram_pages += count; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += count; +#endif + spin_unlock(&managed_page_count_lock); +} +EXPORT_SYMBOL(adjust_managed_page_count); + +unsigned long free_reserved_area(void *start, void *end, int poison, char *s) +{ + void *pos; + unsigned long pages = 0; + + start = (void *)PAGE_ALIGN((unsigned long)start); + end = (void *)((unsigned long)end & PAGE_MASK); + for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + if ((unsigned int)poison <= 0xFF) + memset(pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); + } + + if (pages && s) + pr_info("Freeing %s memory: %ldK (%p - %p)\n", + s, pages << (PAGE_SHIFT - 10), start, end); + + return pages; +} +EXPORT_SYMBOL(free_reserved_area); + +#ifdef CONFIG_HIGHMEM +void free_highmem_page(struct page *page) +{ + __free_reserved_page(page); + totalram_pages++; + page_zone(page)->managed_pages++; + totalhigh_pages++; +} +#endif + + +void __init mem_init_print_info(const char *str) +{ + unsigned long physpages, codesize, datasize, rosize, bss_size; + unsigned long init_code_size, init_data_size; + + physpages = get_num_physpages(); + codesize = _etext - _stext; + datasize = _edata - _sdata; + rosize = __end_rodata - __start_rodata; + bss_size = __bss_stop - __bss_start; + init_data_size = __init_end - __init_begin; + init_code_size = _einittext - _sinittext; + + /* + * Detect special cases and adjust section sizes accordingly: + * 1) .init.* may be embedded into .data sections + * 2) .init.text.* may be out of [__init_begin, __init_end], + * please refer to arch/tile/kernel/vmlinux.lds.S. + * 3) .rodata.* may be embedded into .text or .data sections. + */ +#define adj_init_size(start, end, size, pos, adj) \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; + + adj_init_size(__init_begin, __init_end, init_data_size, + _sinittext, init_code_size); + adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); + adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); + adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); + adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef adj_init_size + + printk("Memory: %luK/%luK available " + "(%luK kernel code, %luK rwdata, %luK rodata, " + "%luK init, %luK bss, %luK reserved" +#ifdef CONFIG_HIGHMEM + ", %luK highmem" +#endif + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages) << (PAGE_SHIFT-10), +#ifdef CONFIG_HIGHMEM + totalhigh_pages << (PAGE_SHIFT-10), +#endif + str ? ", " : "", str ? 
str : ""); +} + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved @@ -5198,8 +5395,8 @@ static void calculate_totalreserve_pages(void) /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); - if (max > zone->present_pages) - max = zone->present_pages; + if (max > zone->managed_pages) + max = zone->managed_pages; reserve_pages += max; /* * Lowmem reserves are not available to @@ -5231,7 +5428,7 @@ static void setup_per_zone_lowmem_reserve(void) for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long present_pages = zone->present_pages; + unsigned long managed_pages = zone->managed_pages; zone->lowmem_reserve[j] = 0; @@ -5245,9 +5442,9 @@ static void setup_per_zone_lowmem_reserve(void) sysctl_lowmem_reserve_ratio[idx] = 1; lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = present_pages / + lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; - present_pages += lower_zone->present_pages; + managed_pages += lower_zone->managed_pages; } } } @@ -5266,14 +5463,14 @@ static void __setup_per_zone_wmarks(void) /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->present_pages; + lowmem_pages += zone->managed_pages; } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->present_pages; + tmp = (u64)pages_min * zone->managed_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* @@ -5285,13 +5482,10 @@ static void __setup_per_zone_wmarks(void) * deltas controls asynch page reclaim, and so should * not be capped for highmem. 
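The setup_per_zone_lowmem_reserve() hunk above now accumulates managed_pages rather than present_pages. A standalone sketch with made-up zone sizes and the default lowmem_reserve_ratio values (256, 256, 32) shows the reserves that fall out of the same loop shape.

#include <stdio.h>

#define NZONES 3

int main(void)
{
	/* Made-up managed page counts for DMA, DMA32 and NORMAL. */
	const char *name[NZONES] = { "DMA", "DMA32", "NORMAL" };
	unsigned long managed[NZONES] = { 3976, 487424, 3670016 };
	unsigned long ratio[NZONES]   = { 256, 256, 32 };
	unsigned long reserve[NZONES][NZONES] = { { 0 } };
	int j, idx;

	/* Same loop shape as setup_per_zone_lowmem_reserve(), using
	 * managed_pages as in this diff. */
	for (j = 0; j < NZONES; j++) {
		unsigned long managed_pages = managed[j];

		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = managed_pages / ratio[idx];
			managed_pages += managed[idx];
		}
	}

	for (idx = 0; idx < NZONES; idx++)
		for (j = idx + 1; j < NZONES; j++)
			printf("%s holds back %lu pages from %s-class allocations\n",
			       name[idx], reserve[idx][j], name[j]);
	return 0;
}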
*/ - int min_pages; + unsigned long min_pages; - min_pages = zone->present_pages / 1024; - if (min_pages < SWAP_CLUSTER_MAX) - min_pages = SWAP_CLUSTER_MAX; - if (min_pages > 128) - min_pages = 128; + min_pages = zone->managed_pages / 1024; + min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { /* @@ -5352,7 +5546,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone) unsigned int gb, ratio; /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); + gb = zone->managed_pages >> (30 - PAGE_SHIFT); if (gb) ratio = int_sqrt(10 * gb); else @@ -5396,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; + int new_min_free_kbytes; lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + + if (new_min_free_kbytes > user_min_free_kbytes) { + min_free_kbytes = new_min_free_kbytes; + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + } else { + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + } setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -5421,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - if (write) + if (write) { + user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); + } return 0; } @@ -5438,7 +5641,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_unmapped_pages = (zone->present_pages * + zone->min_unmapped_pages = (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; return 0; } @@ -5454,7 +5657,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_slab_pages = (zone->present_pages * + zone->min_slab_pages = (zone->managed_pages * sysctl_min_slab_ratio) / 100; return 0; } @@ -5482,7 +5685,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist * can have before it gets flushed back to buddy allocator. 
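The init_per_zone_wmark_min() hunk above keeps a user-supplied min_free_kbytes if it is larger than the computed value; otherwise it derives the value as sqrt(16 * lowmem_kbytes), clamped to [128, 65536]. A small arithmetic sketch (assuming roughly 4 GiB reported by nr_free_buffer_pages(), a made-up figure) reproduces the calculation.

#include <stdio.h>

/* Minimal integer square root, standing in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long lowmem_kbytes = 4UL * 1024 * 1024;	/* assumed ~4 GiB */
	unsigned long user_min_free_kbytes = 0;			/* no sysctl override */
	unsigned long new_min = isqrt(lowmem_kbytes * 16);

	if (new_min > user_min_free_kbytes) {
		if (new_min < 128)
			new_min = 128;
		if (new_min > 65536)
			new_min = 65536;
		printf("min_free_kbytes = %lu\n", new_min);	/* prints 8192 */
	} else {
		printf("keeping user value %lu\n", user_min_free_kbytes);
	}
	return 0;
}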
*/ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5493,14 +5695,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -5631,7 +5835,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - zone->zone_start_pfn; + pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } @@ -5683,8 +5887,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); - VM_BUG_ON(pfn < zone->zone_start_pfn); - VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); + VM_BUG_ON(!zone_spans_pfn(zone, pfn)); for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) if (flags & value) @@ -5782,8 +5985,7 @@ bool is_pageblock_removable_nolock(struct page *page) zone = page_zone(page); pfn = page_to_pfn(page); - if (zone->zone_start_pfn > pfn || - zone->zone_start_pfn + zone->spanned_pages <= pfn) + if (!zone_spans_pfn(zone, pfn)) return false; return !has_unmovable_pages(zone, page, 0, true); @@ -5839,14 +6041,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; - ret = migrate_pages(&cc->migratepages, - alloc_migrate_target, - 0, false, MIGRATE_SYNC, - MR_CMA); + ret = migrate_pages(&cc->migratepages, alloc_migrate_target, + 0, MIGRATE_SYNC, MR_CMA); } - - putback_movable_pages(&cc->migratepages); - return ret > 0 ? 0 : ret; + if (ret < 0) { + putback_movable_pages(&cc->migratepages); + return ret; + } + return 0; } /** @@ -5991,32 +6193,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - drain_zonestat(zone, pset); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. 
+ */ void __meminit zone_pcp_update(struct zone *zone) { - stop_machine(__zone_pcp_update, zone, NULL); + unsigned cpu; + mutex_lock(&pcp_batch_high_lock); + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); + mutex_unlock(&pcp_batch_high_lock); } #endif @@ -6086,6 +6274,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages -= 1 << order; +#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); diff --git a/mm/page_io.c b/mm/page_io.c index 78eee32ee486..ba05b64e5d8d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,6 +20,8 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/frontswap.h> +#include <linux/aio.h> +#include <linux/blkdev.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -35,14 +37,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } return bio; } -static void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err) imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), (unsigned long long)bio->bi_sector); - } else { - SetPageUptodate(page); + goto out; } + + SetPageUptodate(page); + + /* + * There is no guarantee that the page is in swap cache - the software + * suspend code (at least) uses end_swap_bio_read() against a non- + * swapcache page. So we must check PG_swapcache before proceeding with + * this optimization. + */ + if (likely(PageSwapCache(page))) { + struct swap_info_struct *sis; + + sis = page_swap_info(page); + if (sis->flags & SWP_BLKDEV) { + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. 
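The comment above describes the VM side of the hand-shake: on a clean swap-in from a block device that implements swap_slot_free_notify(), the page is re-dirtied and the driver is told to drop its copy of the slot. The driver side, for a RAM-backed device in the zram mould, might look roughly like the hedged sketch below; struct ramswap_dev and ramswap_free_slot() are hypothetical stand-ins, while bd_disk->private_data and the callback signature match the call made from end_swap_bio_read() in this diff.

/*
 * Hypothetical driver-side sketch of the swap_slot_free_notify() hook
 * used by the VM code above.  ramswap_* names are invented; the callback
 * signature matches the call site in end_swap_bio_read().
 */
static void ramswap_swap_slot_free_notify(struct block_device *bdev,
					  unsigned long index)
{
	struct ramswap_dev *dev = bdev->bd_disk->private_data;

	ramswap_free_slot(dev, index);	/* hypothetical: release the slot's memory */
}

static const struct block_device_operations ramswap_ops = {
	.owner			= THIS_MODULE,
	.swap_slot_free_notify	= ramswap_swap_slot_free_notify,
};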
+ */ + struct gendisk *disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } + } + } + +out: unlock_page(page); bio_put(bio); } @@ -185,9 +231,7 @@ bad_bmap: */ int swap_writepage(struct page *page, struct writeback_control *wbc) { - struct bio *bio; - int ret = 0, rw = WRITE; - struct swap_info_struct *sis = page_swap_info(page); + int ret = 0; if (try_to_free_swap(page)) { unlock_page(page); @@ -199,6 +243,17 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) end_page_writeback(page); goto out; } + ret = __swap_writepage(page, wbc, end_swap_bio_write); +out: + return ret; +} + +int __swap_writepage(struct page *page, struct writeback_control *wbc, + void (*end_write_func)(struct bio *, int)) +{ + struct bio *bio; + int ret = 0, rw = WRITE; + struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { struct kiocb kiocb; @@ -214,6 +269,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) kiocb.ki_left = PAGE_SIZE; kiocb.ki_nbytes = PAGE_SIZE; + set_page_writeback(page); unlock_page(page); ret = mapping->a_ops->direct_IO(KERNEL_WRITE, &kiocb, &iov, @@ -222,11 +278,27 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; + } else { + /* + * In the case of swap-over-nfs, this can be a + * temporary failure if the system has limited + * memory for allocating transmit buffers. + * Mark the page dirty and avoid + * rotate_reclaimable_page but rate-limit the + * messages but do not flag PageError like + * the normal direct-to-bio case as it could + * be temporary. 
+ */ + set_page_dirty(page); + ClearPageReclaim(page); + pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", + page_file_offset(page)); } + end_page_writeback(page); return ret; } - bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); unlock_page(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d2264ea4606..383bdbb98b04 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,28 +8,6 @@ #include <linux/memory.h> #include "internal.h" -/* called while holding zone->lock */ -static void set_pageblock_isolate(struct page *page) -{ - if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) - return; - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - page_zone(page)->nr_pageblock_isolate++; -} - -/* called while holding zone->lock */ -static void restore_pageblock_isolate(struct page *page, int migratetype) -{ - struct zone *zone = page_zone(page); - if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) - return; - - BUG_ON(zone->nr_pageblock_isolate <= 0); - set_pageblock_migratetype(page, migratetype); - zone->nr_pageblock_isolate--; -} - int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -80,7 +58,7 @@ out: unsigned long nr_pages; int migratetype = get_pageblock_migratetype(page); - set_pageblock_isolate(page); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) goto out; nr_pages = move_freepages_block(zone, page, migratetype); __mod_zone_freepage_state(zone, nr_pages, migratetype); - restore_pageblock_isolate(page, migratetype); + set_pageblock_migratetype(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 35aa294656cd..5da2cbcfdbb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. */ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - return NULL; -} - static int walk_hugetlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -198,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* - * handle hugetlb vma individually because pagetable walk for - * the hugetlb page is dependent on the architecture and - * we can't handled it in the same manner as non-huge pages. + * This function was not intended to be vma based. 
+ * But there are vma special cases to be handled: + * - hugetlb vma's + * - VM_PFNMAP vma's */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* + * There are no page structures backing a VM_PFNMAP + * range, so do not allow split_huge_page_pmd(). + */ + if ((vma->vm_start <= addr) && + (vma->vm_flags & VM_PFNMAP)) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* - * Hugepage is very tightly coupled with vma, so - * walk through hugetlb entries within a given vma. + * Handle hugetlb vma individually because pagetable + * walk for the hugetlb page is dependent on the + * architecture and we can't handled it in the same + * manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* + * Hugepage is very tightly coupled with vma, + * so walk through hugetlb entries within a + * given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk->mm, next); + continue; + } } if (pgd_none_or_clear_bad(pgd)) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0c8323fe6c8f..e1a6e4fab016 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { assert_spin_locked(&mm->page_table_lock); @@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* no "address" argument so destroys page coloring of some arch */ -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 926b46649749..fd26d0433509 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid, if (flags != 0) return -EINVAL; - if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) - goto out; - - if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) - goto out; - if (vm_write) rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, iovstack_l, @@ -459,8 +453,6 @@ free_iovecs: kfree(iov_r); if (iov_l != iovstack_l) kfree(iov_l); - -out: return rc; } diff --git a/mm/readahead.c b/mm/readahead.c index 7963f2391236..829a77c62834 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -48,7 +48,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping, if (!trylock_page(page)) BUG(); page->mapping = mapping; - do_invalidatepage(page, 0); + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); page->mapping = NULL; unlock_page(page); } @@ -576,7 +576,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) +SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { ssize_t ret; struct fd f; @@ -595,10 +595,3 @@ SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) } return ret; } -#ifdef 
CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_readahead(long fd, loff_t offset, long count) -{ - return SYSC_readahead((int) fd, offset, (size_t) count); -} -SYSCALL_ALIAS(sys_readahead, SyS_readahead); -#endif diff --git a/mm/rmap.c b/mm/rmap.c index 2c78f8cadc95..cd356df4f71a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) */ if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); @@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) avc = NULL; } spin_unlock(&mm->page_table_lock); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); @@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); return 0; @@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, * mapping is already gone, the unmap path will have * set PG_referenced or activated the page. */ - if (likely(!VM_SequentialReadHint(vma))) + if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } pte_unmap_unlock(pte, ptl); @@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) - lru_cache_add_lru(page, LRU_ACTIVE_ANON); - else + if (!mlocked_vma_newpage(vma, page)) { + SetPageActive(page); + lru_cache_add(page); + } else add_page_to_unevictable_list(page); } @@ -1126,7 +1127,6 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { - struct address_space *mapping = page_mapping(page); bool anon = PageAnon(page); bool locked; unsigned long flags; @@ -1144,29 +1144,6 @@ void page_remove_rmap(struct page *page) goto out; /* - * Now that the last pte has gone, s390 must transfer dirty - * flag from storage key to struct page. We can usually skip - * this if the page is anon, so about to be freed; but perhaps - * not if it's in swapcache - there might be another pte slot - * containing the swap entry, but page not yet written to swap. - * - * And we can skip it on file pages, so long as the filesystem - * participates in dirty tracking (note that this is not only an - * optimization but also solves problems caused by dirty flag in - * storage key getting set by a write from inside kernel); but need to - * catch shm and tmpfs and ramfs pages which have been modified since - * creation by read fault. - * - * Note that mapping must be decided above, before decrementing - * mapcount (which luckily provides a barrier): once page is unmapped, - * it could be truncated and page->mapping reset to NULL at any moment. - * Note also that we are relying on page_mapping(page) to set mapping - * to &swapper_space when PageSwapCache(page). - */ - if (mapping && !mapping_cap_account_dirty(mapping) && - page_test_and_clear_dirty(page_to_pfn(page), 1)) - set_page_dirty(page); - /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. 
*/ @@ -1537,6 +1514,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; + if (PageHuge(page)) + pgoff = page->index << compound_order(page); + mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); diff --git a/mm/shmem.c b/mm/shmem.c index 5c90d84c2b02..a87990cf9f94 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -25,11 +25,13 @@ #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> +#include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> +#include <linux/aio.h> static struct vfsmount *shm_mnt; @@ -335,19 +337,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages, pgoff_t *indices) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_pages) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, indices, start, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; if (radix_tree_exception(page)) { @@ -364,17 +366,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } export: - indices[ret] = indices[i]; + indices[ret] = iter.index; pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - if (unlikely(!ret && nr_found)) - goto restart; rcu_read_unlock(); return ret; } @@ -889,7 +890,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ - mpol_to_str(buffer, sizeof(buffer), mpol, 1); + mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } @@ -1295,7 +1296,7 @@ unlock: static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); int error; int ret = VM_FAULT_LOCKED; @@ -1313,14 +1314,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); pgoff_t index; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -1330,7 +1331,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, int shmem_lock(struct file *file, int lock, struct user_struct *user) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; @@ -1465,7 +1466,7 @@ shmem_write_end(struct file *file, 
struct address_space *mapping, static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; @@ -1797,10 +1798,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) } } - if (offset >= 0 && offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); mutex_unlock(&inode->i_mutex); return offset; } @@ -1808,7 +1806,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_falloc shmem_falloc; pgoff_t start, index, end; @@ -1938,6 +1936,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { +#ifdef CONFIG_TMPFS_POSIX_ACL + error = generic_acl_init(inode, dir); + if (error) { + iput(inode); + return error; + } +#endif error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); @@ -1947,6 +1952,33 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) return error; } } + + error = 0; + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + } + return error; +} + +static int +shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); + if (inode) { + error = security_inode_init_security(inode, dir, + NULL, + shmem_initxattrs, NULL); + if (error) { + if (error != -EOPNOTSUPP) { + iput(inode); + return error; + } + } #ifdef CONFIG_TMPFS_POSIX_ACL error = generic_acl_init(inode, dir); if (error) { @@ -1956,10 +1988,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) #else error = 0; #endif - dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_tmpfile(dentry, inode); } return error; } @@ -2351,7 +2380,7 @@ static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, { if (*len < 3) { *len = 3; - return 255; + return FILEID_INVALID; } if (inode_unhashed(inode)) { @@ -2386,6 +2415,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, bool remount) { char *this_char, *value, *rest; + struct mempolicy *mpol = NULL; uid_t uid; gid_t gid; @@ -2414,7 +2444,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, printk(KERN_ERR "tmpfs: No value for mount option '%s'\n", this_char); - return 1; + goto error; } if (!strcmp(this_char,"size")) { @@ -2463,19 +2493,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (!gid_valid(sbinfo->gid)) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (mpol_parse_str(value, &sbinfo->mpol, 1)) + mpol_put(mpol); + mpol = NULL; + if (mpol_parse_str(value, &mpol)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option 
%s\n", this_char); - return 1; + goto error; } } + sbinfo->mpol = mpol; return 0; bad_val: printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); +error: + mpol_put(mpol); return 1; } @@ -2487,6 +2522,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) unsigned long inodes; int error = -EINVAL; + config.mpol = NULL; if (shmem_parse_options(data, &config, true)) return error; @@ -2511,8 +2547,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; - mpol_put(sbinfo->mpol); - sbinfo->mpol = config.mpol; /* transfers initial ref */ + /* + * Preserve previous mempolicy unless mpol remount option was specified. + */ + if (config.mpol) { + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ + } out: spin_unlock(&sbinfo->stat_lock); return error; @@ -2545,6 +2586,7 @@ static void shmem_put_super(struct super_block *sb) struct shmem_sb_info *sbinfo = SHMEM_SB(sb); percpu_counter_destroy(&sbinfo->used_blocks); + mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } @@ -2709,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename, + .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, @@ -2766,6 +2809,7 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = shmem_mount, .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, }; int __init shmem_init(void) @@ -2817,12 +2861,11 @@ out4: * effectively equivalent, but much lighter weight. */ -#include <linux/ramfs.h> - static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, }; int __init shmem_init(void) @@ -2865,6 +2908,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ +static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", + dentry->d_name.name); +} + +static struct dentry_operations anon_ops = { + .d_dname = shmem_dname +}; + /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps @@ -2873,15 +2926,14 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); */ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { - int error; - struct file *file; + struct file *res; struct inode *inode; struct path path; - struct dentry *root; + struct super_block *sb; struct qstr this; if (IS_ERR(shm_mnt)) - return (void *)shm_mnt; + return ERR_CAST(shm_mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); @@ -2889,43 +2941,41 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - error = -ENOMEM; + res = ERR_PTR(-ENOMEM); this.name = name; this.len = strlen(name); this.hash = 0; /* will go */ - root = shm_mnt->mnt_root; - path.dentry = d_alloc(root, &this); + sb = shm_mnt->mnt_sb; + path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; + d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(shm_mnt); - error = -ENOSPC; - inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); + res = ERR_PTR(-ENOSPC); + inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto 
put_dentry; d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ -#ifndef CONFIG_MMU - error = ramfs_nommu_expand_for_mapping(inode, size); - if (error) + res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); + if (IS_ERR(res)) goto put_dentry; -#endif - error = -ENFILE; - file = alloc_file(&path, FMODE_WRITE | FMODE_READ, + res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); - if (!file) + if (IS_ERR(res)) goto put_dentry; - return file; + return res; put_dentry: path_put(&path); put_memory: shmem_unacct_size(flags, size); - return ERR_PTR(error); + return res; } EXPORT_SYMBOL_GPL(shmem_file_setup); diff --git a/mm/slab.c b/mm/slab.c index 57ab42297d96..35cb0c861508 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -714,7 +714,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); - add_taint(TAINT_BAD_PAGE); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } #endif @@ -1923,11 +1923,9 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); - print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); - printk("\n"); + printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + *dbg_userword(cachep, objp), + *dbg_userword(cachep, objp)); } realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; diff --git a/mm/slab_common.c b/mm/slab_common.c index eacdffaf71c9..538bade6df7d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -373,8 +373,10 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { int index; - if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE)) + if (size > KMALLOC_MAX_SIZE) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; + } if (size <= 192) { if (!size) @@ -446,18 +448,18 @@ void __init create_kmalloc_caches(unsigned long flags) if (!kmalloc_caches[i]) { kmalloc_caches[i] = create_kmalloc_cache(NULL, 1 << i, flags); + } - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); + /* + * Caches that are not of the two-to-the-power-of size. 
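For the restructured create_kmalloc_caches() loop above: the 96- and 192-byte caches fill the gaps between the power-of-two sizes, and they can only be hooked in once the 64-byte (i == 6) and 128-byte (i == 7) caches exist. A rough illustration of the resulting buckets, assuming KMALLOC_MIN_SIZE <= 32; the real lookup is done by kmalloc_slab() above, not by this helper:

static size_t kmalloc_bucket_example(size_t size)
{
	if (size <= 64)
		return 64;	/* kmalloc_caches[6] */
	if (size <= 96)
		return 96;	/* kmalloc_caches[1], created when i == 6 */
	if (size <= 128)
		return 128;	/* kmalloc_caches[7] */
	if (size <= 192)
		return 192;	/* kmalloc_caches[2], created when i == 7 */
	return 256;		/* powers of two from here on up */
}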
+ * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) + kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); - } + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) + kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); } /* Kmalloc array is now usable */ diff --git a/mm/slob.c b/mm/slob.c index 3d73b3b8fb1d..91bd3f2dd2f0 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -360,7 +360,7 @@ static void slob_free(void *block, int size) clear_slob_page_free(sp); spin_unlock_irqrestore(&slob_lock, flags); __ClearPageSlab(sp); - reset_page_mapcount(sp); + page_mapcount_reset(sp); slob_free_pages(b, 0); return; } diff --git a/mm/slub.c b/mm/slub.c index 33f71330e713..3b482c863002 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include "slab.h" #include <linux/proc_fs.h> +#include <linux/notifier.h> #include <linux/seq_file.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> @@ -571,7 +572,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) printk(KERN_ERR "----------------------------------------" "-------------------------------------\n\n"); - add_taint(TAINT_BAD_PAGE); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) @@ -1417,7 +1418,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); memcg_release_pages(s, order); - reset_page_mapcount(page); + page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; __free_memcg_kmem_pages(page, order); @@ -3443,7 +3444,6 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; @@ -3558,7 +3558,10 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -#endif /* CONFIG_MEMORY_HOTPLUG */ +static struct notifier_block slab_memory_callback_nb = { + .notifier_call = slab_memory_callback, + .priority = SLAB_CALLBACK_PRI, +}; /******************************************************************** * Basic setup of slabs @@ -3615,7 +3618,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); + register_hotmemory_notifier(&slab_memory_callback_nb); /* Able to allocate the per node structures */ slab_state = PARTIAL; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1b7e22ab9b09..27eeab3be757 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,10 +53,12 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) struct page *page; if (node_state(node, N_HIGH_MEMORY)) - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, get_order(size)); + page = alloc_pages_node( + node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + get_order(size)); else - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + page = alloc_pages( + GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, get_order(size)); if (page) return page_address(page); @@ -145,11 +147,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) return pgd; } -int __meminit vmemmap_populate_basepages(struct page *start_page, - unsigned long size, 
int node) +int __meminit vmemmap_populate_basepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr = start; pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -176,9 +177,15 @@ int __meminit vmemmap_populate_basepages(struct page *start_page, struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) { - struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); - int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); - if (error) + unsigned long start; + unsigned long end; + struct page *map; + + map = pfn_to_page(pnum * PAGES_PER_SECTION); + start = (unsigned long)map; + end = (unsigned long)(map + PAGES_PER_SECTION); + + if (vmemmap_populate(start, end, nid)) return NULL; return map; diff --git a/mm/sparse.c b/mm/sparse.c index 6b5fb762e2ca..308d50331bc3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -79,7 +79,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) { unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; - int ret = 0; if (mem_section[root]) return -EEXIST; @@ -90,7 +89,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) mem_section[root] = section; - return ret; + return 0; } #else /* !SPARSEMEM_EXTREME */ static inline int sparse_index_init(unsigned long section_nr, int nid) @@ -481,6 +480,9 @@ void __init sparse_init(void) struct page **map_map; #endif + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ set_pageblock_order(); @@ -615,11 +617,20 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, } static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - return; /* XXX: Not implemented yet */ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + + vmemmap_free(start, end); } +#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + + vmemmap_free(start, end); } +#endif /* CONFIG_MEMORY_HOTREMOVE */ #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) { @@ -657,6 +668,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) get_order(sizeof(struct page) * nr_pages)); } +#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; @@ -683,40 +695,9 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) put_page_bootmem(page); } } +#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static void free_section_usemap(struct page *memmap, unsigned long *usemap) -{ - struct page *usemap_page; - unsigned long nr_pages; - - if (!usemap) - return; - - usemap_page = virt_to_page(usemap); - /* - * Check to see if allocation came from hot-plug-add - */ - if (PageSlab(usemap_page)) { - kfree(usemap); - if (memmap) - __kfree_section_memmap(memmap, PAGES_PER_SECTION); - return; - } - - /* - * The usemap came from bootmem. This is packed with other usemaps - * on the section which has pgdat at boot time. Just keep it as is now. 
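Putting the sparse-vmemmap hunks above together: both population on hot-add and teardown on hot-remove now operate on a [start, end) virtual address range covering the section's memmap, rather than on a (struct page *, nr_pages) pair. A rough consolidation, not a verbatim excerpt:

	struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
	unsigned long start = (unsigned long)map;
	unsigned long end = (unsigned long)(map + PAGES_PER_SECTION);

	if (vmemmap_populate(start, end, nid))	/* arch hook, new signature */
		return NULL;			/* hot-add failed */

	/* and on hot-remove the same range is torn down again */
	vmemmap_free(start, end);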
- */ - - if (memmap) { - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - - free_map_bootmem(memmap, nr_pages); - } -} - /* * returns the number of sections whose mem_maps were properly * set. If this is <=0, then that means that the passed-in @@ -772,6 +753,7 @@ out: return ret; } +#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -782,7 +764,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) for (i = 0; i < PAGES_PER_SECTION; i++) { if (PageHWPoison(&memmap[i])) { - atomic_long_sub(1, &mce_bad_pages); + atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); } } @@ -793,11 +775,45 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif +static void free_section_usemap(struct page *memmap, unsigned long *usemap) +{ + struct page *usemap_page; + unsigned long nr_pages; + + if (!usemap) + return; + + usemap_page = virt_to_page(usemap); + /* + * Check to see if allocation came from hot-plug-add + */ + if (PageSlab(usemap_page) || PageCompound(usemap_page)) { + kfree(usemap); + if (memmap) + __kfree_section_memmap(memmap, PAGES_PER_SECTION); + return; + } + + /* + * The usemap came from bootmem. This is packed with other usemaps + * on the section which has pgdat at boot time. Just keep it as is now. + */ + + if (memmap) { + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + free_map_bootmem(memmap, nr_pages); + } +} + void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) { struct page *memmap = NULL; - unsigned long *usemap = NULL; + unsigned long *usemap = NULL, flags; + struct pglist_data *pgdat = zone->zone_pgdat; + pgdat_resize_lock(pgdat, &flags); if (ms->section_mem_map) { usemap = ms->pageblock_flags; memmap = sparse_decode_mem_map(ms->section_mem_map, @@ -805,8 +821,10 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) ms->section_mem_map = 0; ms->pageblock_flags = NULL; } + pgdat_resize_unlock(pgdat, &flags); clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); free_section_usemap(memmap, usemap); } -#endif +#endif /* CONFIG_MEMORY_HOTREMOVE */ +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c index 6310dc2008ff..4a1d0d2c52fa 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,13 +30,17 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> +#include <linux/uio.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/pagemap.h> + /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); @@ -383,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); + trace_mm_lru_activate(page, page_to_pfn(page)); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -427,6 +432,33 @@ void activate_page(struct page *page) } #endif +static void __lru_cache_activate_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + int i; + + /* + * Search backwards on the optimistic assumption that the page being + * activated has just been added to this pagevec. 
Note that only + * the local pagevec is examined as a !PageLRU page could be in the + * process of being released, reclaimed, migrated or on a remote + * pagevec that is currently being drained. Furthermore, marking + * a remote pagevec's page PageActive potentially hits a race where + * a page is marked PageActive just after it is added to the inactive + * list causing accounting errors and BUG_ON checks to trigger. + */ + for (i = pagevec_count(pvec) - 1; i >= 0; i--) { + struct page *pagevec_page = pvec->pages[i]; + + if (pagevec_page == page) { + SetPageActive(page); + break; + } + } + + put_cpu_var(lru_add_pvec); +} + /* * Mark a page as having seen activity. * @@ -437,8 +469,18 @@ void activate_page(struct page *page) void mark_page_accessed(struct page *page) { if (!PageActive(page) && !PageUnevictable(page) && - PageReferenced(page) && PageLRU(page)) { - activate_page(page); + PageReferenced(page)) { + + /* + * If the page is on the LRU, queue it for activation via + * activate_page_pvecs. Otherwise, assume the page is on a + * pagevec, mark it active and it'll be moved to the active + * LRU on the next drain. + */ + if (PageLRU(page)) + activate_page(page); + else + __lru_cache_activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); @@ -447,42 +489,37 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); /* - * Order of operations is important: flush the pagevec when it's already - * full, not when adding the last page, to make sure that last page is - * not added to the LRU directly when passed to this function. Because - * mark_page_accessed() (called after this when writing) only activates - * pages that are on the LRU, linear writes in subpage chunks would see - * every PAGEVEC_SIZE page activated, which is unexpected. + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of __lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ -void __lru_cache_add(struct page *page, enum lru_list lru) +void __lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); page_cache_get(page); if (!pagevec_space(pvec)) - __pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec); pagevec_add(pvec, page); - put_cpu_var(lru_add_pvecs); + put_cpu_var(lru_add_pvec); } EXPORT_SYMBOL(__lru_cache_add); /** - * lru_cache_add_lru - add a page to a page list + * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. - * @lru: the LRU list to which the page is added. 
*/ -void lru_cache_add_lru(struct page *page, enum lru_list lru) +void lru_cache_add(struct page *page) { if (PageActive(page)) { VM_BUG_ON(PageUnevictable(page)); - ClearPageActive(page); } else if (PageUnevictable(page)) { VM_BUG_ON(PageActive(page)); - ClearPageUnevictable(page); } - VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); - __lru_cache_add(page, lru); + VM_BUG_ON(PageLRU(page)); + __lru_cache_add(page); } /** @@ -582,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); - struct pagevec *pvec; - int lru; + struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); - for_each_lru(lru) { - pvec = &pvecs[lru - LRU_BASE]; - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec, lru); - } + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -707,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold) del_page_from_lru_list(page, lruvec, page_off_lru(page)); } + /* Clear Active bit in case of parallel mark_page_accessed */ + ClearPageActive(page); + list_add(&page->lru, &pages_to_free); } if (zone) @@ -737,7 +772,7 @@ EXPORT_SYMBOL(__pagevec_release); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ void lru_add_page_tail(struct page *page, struct page *page_tail, - struct lruvec *lruvec) + struct lruvec *lruvec, struct list_head *list) { int uninitialized_var(active); enum lru_list lru; @@ -749,7 +784,8 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); - SetPageLRU(page_tail); + if (!list) + SetPageLRU(page_tail); if (page_evictable(page_tail)) { if (PageActive(page)) { @@ -767,7 +803,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, if (likely(PageLRU(page))) list_add_tail(&page_tail->lru, &page->lru); - else { + else if (list) { + /* page reclaim is reclaiming a huge page */ + get_page(page_tail); + list_add_tail(&page_tail->lru, list); + } else { struct list_head *list_head; /* * Head page has not yet been counted, as an hpage, @@ -789,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg) { - enum lru_list lru = (enum lru_list)arg; - int file = is_file_lru(lru); - int active = is_active_lru(lru); + int file = page_is_file_cache(page); + int active = PageActive(page); + enum lru_list lru = page_lru(page); - VM_BUG_ON(PageActive(page)); VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - if (active) - SetPageActive(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); + trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); } /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. 
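Taken together, the swap.c hunks above collapse the per-LRU pagevecs into a single per-CPU lru_add_pvec and defer the active/inactive decision until drain time. A hedged walk-through of a typical file-read sequence under the new scheme:

	lru_cache_add(page);		/* parked on this CPU's lru_add_pvec,
					 * not yet PageLRU */
	mark_page_accessed(page);	/* first touch only sets PageReferenced */
	mark_page_accessed(page);	/* page is still !PageLRU, so
					 * __lru_cache_activate_page() finds it
					 * in the local pagevec and sets
					 * PageActive directly */
	lru_add_drain();		/* __pagevec_lru_add_fn() derives the LRU
					 * list from the page flags and files
					 * the page straight onto the active list */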
*/ -void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec) { - VM_BUG_ON(is_unevictable_lru(lru)); - - pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); } EXPORT_SYMBOL(__pagevec_lru_add); @@ -855,9 +891,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); - #ifdef CONFIG_SWAP - bdi_init(swapper_space.backing_dev_info); + int i; + + bdi_init(swapper_spaces[0].backing_dev_info); + for (i = 0; i < MAX_SWAPFILES; i++) { + spin_lock_init(&swapper_spaces[i].tree_lock); + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); + } #endif /* Use a smaller cluster for small-memory machines */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 0cb36fb1f61c..f24ab0dff554 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, }; -struct address_space swapper_space = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), - .a_ops = &swap_aops, - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), - .backing_dev_info = &swap_backing_dev_info, +struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .a_ops = &swap_aops, + .backing_dev_info = &swap_backing_dev_info, + } }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,13 +53,24 @@ static struct { unsigned long find_total; } swap_cache_info; +unsigned long total_swapcache_pages(void) +{ + int i; + unsigned long ret = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) + ret += swapper_spaces[i].nrpages; + return ret; +} + void show_swap_cache_info(void) { - printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); - printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", + get_nr_swap_pages() << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } @@ -67,9 +78,10 @@ void show_swap_cache_info(void) * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. 
*/ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; + struct address_space *address_space; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageSwapCache(page)); @@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); set_page_private(page, entry.val); - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + error = radix_tree_insert(&address_space->page_tree, + entry.val, page); if (likely(!error)) { - total_swapcache_pages++; + address_space->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); if (unlikely(error)) { /* @@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { + swp_entry_t entry; + struct address_space *address_space; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); - radix_tree_delete(&swapper_space.page_tree, page_private(page)); + entry.val = page_private(page); + address_space = swap_address_space(entry); + radix_tree_delete(&address_space->page_tree, page_private(page)); set_page_private(page, 0); ClearPageSwapCache(page); - total_swapcache_pages--; + address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } @@ -141,7 +160,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page) +int add_to_swap(struct page *page, struct list_head *list) { swp_entry_t entry; int err; @@ -154,7 +173,7 @@ int add_to_swap(struct page *page) return 0; if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) { + if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry, NULL); return 0; } @@ -195,12 +214,14 @@ int add_to_swap(struct page *page) void delete_from_swap_cache(struct page *page) { swp_entry_t entry; + struct address_space *address_space; entry.val = page_private(page); - spin_lock_irq(&swapper_space.tree_lock); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); __delete_from_swap_cache(page); - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); swapcache_free(entry, page); page_cache_release(page); @@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (page) INC_CACHE_INFO(find_success); @@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(swap_address_space(entry), + entry.val); if (found_page) break; @@ -314,8 +336,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Swap entry may have been freed since our caller observed it. 
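The swap_state.c hunks above split the single swapper_space into one address_space per swap type, each with its own tree_lock. The routing helper lives in a header outside this diff; presumably it looks like the sketch below, with swp_type() selecting the swap file while the whole entry value remains the radix-tree index:

/* assumed definition, not part of this diff */
#define swap_address_space(entry)	(&swapper_spaces[swp_type(entry)])

	/* lookups then contend only on the per-type tree_lock */
	page = find_get_page(swap_address_space(entry), entry.val);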
*/ err = swapcache_prepare(entry); - if (err == -EEXIST) { /* seems racy */ + if (err == -EEXIST) { radix_tree_preload_end(); + /* + * We might race against get_swap_page() and stumble + * across a SWAP_HAS_CACHE swap_map entry whose page + * has not been brought into the swapcache yet, while + * the other end is scheduled away waiting on discard + * I/O completion at scan_swap_map(). + * + * In order to avoid turning this transitory state + * into a permanent loop around this -EEXIST case + * if !CONFIG_PREEMPT and the I/O completion happens + * to be waiting on the CPU waitqueue where we are now + * busy looping, we just conditionally invoke the + * scheduler here, if there are some more important + * tasks to run. + */ + cond_resched(); continue; } if (err) { /* swp entry is obsolete ? */ diff --git a/mm/swapfile.c b/mm/swapfile.c index e97a0e5aea91..36af6eeaa67e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; -long nr_swap_pages; +atomic_long_t nr_swap_pages; +/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; +static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (!page) return 0; /* @@ -210,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_DISCARDABLE) { + if (si->flags & SWP_PAGE_DISCARD) { /* * Start range check on racing allocations, in case * they overlap the cluster we eventually decide on @@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->lowest_alloc = si->max; si->highest_alloc = 0; } - spin_unlock(&swap_lock); + spin_unlock(&si->lock); /* * If seek is expensive, start searching for new cluster from @@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - spin_lock(&swap_lock); + spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - spin_lock(&swap_lock); + spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } offset = scan_base; - spin_lock(&swap_lock); + spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; si->lowest_alloc = 0; } @@ -293,9 +295,9 @@ checks: /* reuse swap entry of cache-only swap if not busy. 
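The swapfile.c conversions around here split the old global swap_lock: swap_lock keeps guarding the swap list and priorities, each swap_info_struct gains its own si->lock for its swap_map, and nr_swap_pages becomes an atomic_long_t that needs no lock at all. The rough shape of get_swap_page() after the split, as a sketch rather than a verbatim excerpt:

	spin_lock(&swap_lock);
	atomic_long_dec(&nr_swap_pages);
	/* walk swap_list under swap_lock to choose a swap_info_struct *si */
	spin_lock(&si->lock);			/* nests inside swap_lock */
	spin_unlock(&swap_lock);		/* dropped before the long scan */
	offset = scan_swap_map(si, SWAP_HAS_CACHE);	/* only si->lock held */
	spin_unlock(&si->lock);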
*/ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - spin_unlock(&swap_lock); + spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); - spin_lock(&swap_lock); + spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) goto checks; @@ -320,7 +322,7 @@ checks: if (si->lowest_alloc) { /* - * Only set when SWP_DISCARDABLE, and there's a scan + * Only set when SWP_PAGE_DISCARD, and there's a scan * for a free cluster in progress or just completed. */ if (found_free_cluster) { @@ -335,13 +337,13 @@ checks: si->lowest_alloc <= last_in_cluster) last_in_cluster = si->lowest_alloc - 1; si->flags |= SWP_DISCARDING; - spin_unlock(&swap_lock); + spin_unlock(&si->lock); if (offset < last_in_cluster) discard_swap_cluster(si, offset, last_in_cluster - offset + 1); - spin_lock(&swap_lock); + spin_lock(&si->lock); si->lowest_alloc = 0; si->flags &= ~SWP_DISCARDING; @@ -355,10 +357,10 @@ checks: * could defer that delay until swap_writepage, * but it's easier to keep this self-contained. */ - spin_unlock(&swap_lock); + spin_unlock(&si->lock); wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&swap_lock); + spin_lock(&si->lock); } else { /* * Note pages allocated by racing tasks while @@ -374,14 +376,14 @@ checks: return offset; scan: - spin_unlock(&swap_lock); + spin_unlock(&si->lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -392,11 +394,11 @@ scan: offset = si->lowest_bit; while (++offset < scan_base) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -404,7 +406,7 @@ scan: latency_ration = LATENCY_LIMIT; } } - spin_lock(&swap_lock); + spin_lock(&si->lock); no_page: si->flags -= SWP_SCANNING; @@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void) pgoff_t offset; int type, next; int wrapped = 0; + int hp_index; spin_lock(&swap_lock); - if (nr_swap_pages <= 0) + if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; - nr_swap_pages--; + atomic_long_dec(&nr_swap_pages); for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { + hp_index = atomic_xchg(&highest_priority_index, -1); + /* + * highest_priority_index records current highest priority swap + * type which just frees swap entries. If its priority is + * higher than that of swap_list.next swap type, we use it. It + * isn't protected by swap_lock, so it can be an invalid value + * if the corresponding swap type is swapoff. We double check + * the flags here. It's even possible the swap type is swapoff + * and swapon again and its priority is changed. In such rare + * case, low prority swap type might be used, but eventually + * high priority swap will be used after several rounds of + * swap. 
+ */ + if (hp_index != -1 && hp_index != type && + swap_info[type]->prio < swap_info[hp_index]->prio && + (swap_info[hp_index]->flags & SWP_WRITEOK)) { + type = hp_index; + swap_list.next = type; + } + si = swap_info[type]; next = si->next; if (next < 0 || @@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void) wrapped++; } - if (!si->highest_bit) + spin_lock(&si->lock); + if (!si->highest_bit) { + spin_unlock(&si->lock); continue; - if (!(si->flags & SWP_WRITEOK)) + } + if (!(si->flags & SWP_WRITEOK)) { + spin_unlock(&si->lock); continue; + } swap_list.next = next; + + spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); - if (offset) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); + if (offset) return swp_entry(type, offset); - } + spin_lock(&swap_lock); next = swap_list.next; } - nr_swap_pages++; + atomic_long_inc(&nr_swap_pages); noswap: spin_unlock(&swap_lock); return (swp_entry_t) {0}; @@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type) struct swap_info_struct *si; pgoff_t offset; - spin_lock(&swap_lock); si = swap_info[type]; + spin_lock(&si->lock); if (si && (si->flags & SWP_WRITEOK)) { - nr_swap_pages--; + atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); if (offset) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return swp_entry(type, offset); } - nr_swap_pages++; + atomic_long_inc(&nr_swap_pages); } - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return (swp_entry_t) {0}; } @@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - spin_lock(&swap_lock); + spin_lock(&p->lock); return p; bad_free: @@ -511,6 +541,27 @@ out: return NULL; } +/* + * This swap type frees swap entry, check if it is the highest priority swap + * type which just frees swap entry. get_swap_page() uses + * highest_priority_index to search highest priority swap type. The + * swap_info_struct.lock can't protect us if there are multiple swap types + * active, so we use atomic_cmpxchg. 
+ */ +static void set_highest_priority_index(int type) +{ + int old_hp_index, new_hp_index; + + do { + old_hp_index = atomic_read(&highest_priority_index); + if (old_hp_index != -1 && + swap_info[old_hp_index]->prio >= swap_info[type]->prio) + break; + new_hp_index = type; + } while (atomic_cmpxchg(&highest_priority_index, + old_hp_index, new_hp_index) != old_hp_index); +} + static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - if (swap_list.next >= 0 && - p->prio > swap_info[swap_list.next]->prio) - swap_list.next = p->type; - nr_swap_pages++; + set_highest_priority_index(p->type); + atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); if (p->flags & SWP_BLKDEV) { @@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { swap_entry_free(p, entry, 1); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) count = swap_entry_free(p, entry, SWAP_HAS_CACHE); if (page) mem_cgroup_uncharge_swapcache(page, entry, count != 0); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -617,7 +666,7 @@ int page_swapcount(struct page *page) p = swap_info_get(entry); if (p) { count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } return count; } @@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), + entry.val); if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; } } - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } if (page) { /* @@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free) if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; + spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= sis->inuse_pages; } + spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; @@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { + struct page *swapcache; struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; + swapcache = page; + page = ksm_might_need_to_copy(page, vma, addr); + if (unlikely(!page)) + return -ENOMEM; + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &memcg)) { ret = -ENOMEM; @@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); - page_add_anon_rmap(page, vma, addr); + if (page == swapcache) + page_add_anon_rmap(page, vma, addr); + else /* ksm created a completely new copy */ + page_add_new_anon_rmap(page, vma, addr); mem_cgroup_commit_charge_swapin(page, memcg); swap_free(entry); /* @@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, out: pte_unmap_unlock(pte, ptl); out_nolock: + if (page != swapcache) { + unlock_page(page); + put_page(page); + } return ret; } @@ -1444,8 +1509,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t 
*span) } static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - unsigned long *frontswap_map) + unsigned char *swap_map) { int i, prev; @@ -1454,9 +1518,8 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; - frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; - nr_swap_pages += p->pages; + atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; /* insert swap space into swap_list: */ @@ -1477,16 +1540,20 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, unsigned long *frontswap_map) { + frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); - _enable_swap_info(p, prio, swap_map, frontswap_map); - frontswap_init(p->type); + spin_lock(&p->lock); + _enable_swap_info(p, prio, swap_map); + spin_unlock(&p->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); - _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); + spin_lock(&p->lock); + _enable_swap_info(p, p->prio, p->swap_map); + spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1494,6 +1561,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; @@ -1546,14 +1614,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) /* just pick something that's safe... */ swap_list.next = swap_list.head; } + spin_lock(&p->lock); if (p->prio < 0) { for (i = p->next; i >= 0; i = swap_info[i]->next) swap_info[i]->prio = p->prio--; least_priority++; } - nr_swap_pages -= p->pages; + atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; + spin_unlock(&p->lock); spin_unlock(&swap_lock); set_current_oom_origin(); @@ -1572,14 +1642,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_lock(&swapon_mutex); spin_lock(&swap_lock); + spin_lock(&p->lock); drain_mmlist(); /* wait for anyone still in scan_swap_map */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { + spin_unlock(&p->lock); spin_unlock(&swap_lock); schedule_timeout_uninterruptible(1); spin_lock(&swap_lock); + spin_lock(&p->lock); } swap_file = p->swap_file; @@ -1588,11 +1661,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; - frontswap_invalidate_area(type); + frontswap_map = frontswap_map_get(p); + frontswap_map_set(p, NULL); + spin_unlock(&p->lock); spin_unlock(&swap_lock); + frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); vfree(swap_map); - vfree(frontswap_map_get(p)); + vfree(frontswap_map); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -1699,7 +1775,7 @@ static int swap_show(struct seq_file *swap, void *v) len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", - S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? + S_ISBLK(file_inode(file)->i_mode) ? 
"partition" : "file\t", si->pages << (PAGE_SHIFT - 10), si->inuse_pages << (PAGE_SHIFT - 10), @@ -1794,6 +1870,7 @@ static struct swap_info_struct *alloc_swap_info(void) p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); + spin_lock_init(&p->lock); return p; } @@ -1939,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; } +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations. + */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -2039,15 +2130,44 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* frontswap enabled? set up bit-per-page map for frontswap */ if (frontswap_enabled) - frontswap_map = vzalloc(maxpages / sizeof(long)); + frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + } + + if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); + + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + printk(KERN_ERR + "swapon: discard_swap(%p): %d\n", + p, err); + } } - if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; } mutex_lock(&swapon_mutex); @@ -2058,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", (frontswap_map) ? 
"FS" : ""); mutex_unlock(&swapon_mutex); @@ -2116,7 +2238,7 @@ void si_swapinfo(struct sysinfo *val) if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += si->inuse_pages; } - val->freeswap = nr_swap_pages + nr_to_be_unused; + val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } @@ -2149,7 +2271,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p = swap_info[type]; offset = swp_offset(entry); - spin_lock(&swap_lock); + spin_lock(&p->lock); if (unlikely(offset >= p->max)) goto unlock_out; @@ -2184,7 +2306,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&swap_lock); + spin_unlock(&p->lock); out: return err; @@ -2309,7 +2431,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return -ENOMEM; } @@ -2357,7 +2479,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out: - spin_unlock(&swap_lock); + spin_unlock(&si->lock); outer: if (page) __free_page(page); diff --git a/mm/truncate.c b/mm/truncate.c index c75b736e54b7..e2e8a8a7eb9d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -26,7 +26,8 @@ /** * do_invalidatepage - invalidate part or all of a page * @page: the page which is affected - * @offset: the index of the truncation point + * @offset: start of the range to invalidate + * @length: length of the range to invalidate * * do_invalidatepage() is called when all or part of the page has become * invalidated by a truncate operation. @@ -37,24 +38,18 @@ * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ -void do_invalidatepage(struct page *page, unsigned long offset) +void do_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - void (*invalidatepage)(struct page *, unsigned long); + void (*invalidatepage)(struct page *, unsigned int, unsigned int); + invalidatepage = page->mapping->a_ops->invalidatepage; #ifdef CONFIG_BLOCK if (!invalidatepage) invalidatepage = block_invalidatepage; #endif if (invalidatepage) - (*invalidatepage)(page, offset); -} - -static inline void truncate_partial_page(struct page *page, unsigned partial) -{ - zero_user_segment(page, partial, PAGE_CACHE_SIZE); - cleancache_invalidate_page(page->mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial); + (*invalidatepage)(page, offset, length); } /* @@ -103,7 +98,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) return -EIO; if (page_has_private(page)) - do_invalidatepage(page, 0); + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); cancel_dirty_page(page, PAGE_CACHE_SIZE); @@ -185,11 +180,11 @@ int invalidate_inode_page(struct page *page) * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate - * @lend: offset to which to truncate + * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between - * specified offsets (and zeroing out partial page - * (if lstart is not page aligned)). + * specified offsets (and zeroing out partial pages + * if lstart or lend + 1 is not page aligned). 
* * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass @@ -200,35 +195,58 @@ int invalidate_inode_page(struct page *page) * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. + * + * Note that since ->invalidatepage() accepts range to invalidate + * truncate_inode_pages_range is able to handle cases where lend + 1 is not + * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - struct pagevec pvec; - pgoff_t index; - pgoff_t end; - int i; + pgoff_t start; /* inclusive */ + pgoff_t end; /* exclusive */ + unsigned int partial_start; /* inclusive */ + unsigned int partial_end; /* exclusive */ + struct pagevec pvec; + pgoff_t index; + int i; cleancache_invalidate_inode(mapping); if (mapping->nrpages == 0) return; - BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); - end = (lend >> PAGE_CACHE_SHIFT); + /* Offsets within partial pages */ + partial_start = lstart & (PAGE_CACHE_SIZE - 1); + partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + + /* + * 'start' and 'end' always covers the range of pages to be fully + * truncated. Partial pages are covered with 'partial_start' at the + * start of the range and 'partial_end' at the end of the range. + * Note that 'end' is exclusive while 'lend' is inclusive. + */ + start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (lend == -1) + /* + * lend == -1 indicates end-of-file so we have to set 'end' + * to the highest possible pgoff_t and since the type is + * unsigned we're using -1. 
+ */ + end = -1; + else + end = (lend + 1) >> PAGE_CACHE_SHIFT; pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index < end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE))) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = page->index; - if (index > end) + if (index >= end) break; if (!trylock_page(page)) @@ -247,27 +265,56 @@ void truncate_inode_pages_range(struct address_space *mapping, index++; } - if (partial) { + if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + /* Truncation within a single page */ + top = partial_end; + partial_end = 0; + } wait_on_page_writeback(page); - truncate_partial_page(page, partial); + zero_user_segment(page, partial_start, top); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, partial_start, + top - partial_start); unlock_page(page); page_cache_release(page); } } + if (partial_end) { + struct page *page = find_lock_page(mapping, end); + if (page) { + wait_on_page_writeback(page); + zero_user_segment(page, 0, partial_end); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, 0, + partial_end); + unlock_page(page); + page_cache_release(page); + } + } + /* + * If the truncation happened within a single page no pages + * will be released, just zeroed, so we can bail out now. + */ + if (start >= end) + return; index = start; for ( ; ; ) { cond_resched(); if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + min(end - index, (pgoff_t)PAGEVEC_SIZE))) { if (index == start) break; index = start; continue; } - if (index == start && pvec.pages[0]->index > end) { + if (index == start && pvec.pages[0]->index >= end) { pagevec_release(&pvec); break; } @@ -277,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping, /* We rely upon deletion not changing page->index */ index = page->index; - if (index > end) + if (index >= end) break; lock_page(page); @@ -598,10 +645,8 @@ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) * This rounding is currently just for example: unmap_mapping_range * expands its hole outwards, whereas we want it to contract the hole * inwards. However, existing callers of truncate_pagecache_range are - * doing their own page rounding first; and truncate_inode_pages_range - * currently BUGs if lend is not pagealigned-1 (it handles partial - * page at start of hole, but not partial page at end of hole). Note - * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. + * doing their own page rounding first. Note that unmap_mapping_range + * allows holelen 0 for all, and we allow lend -1 for end of file. 
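As a reading aid for the offset arithmetic introduced in truncate_inode_pages_range() above: a standalone userspace sketch of the same index math, assuming 4KB pages for PAGE_CACHE_SIZE; truncate_range() and the sample offsets are illustrative stand-ins only, and the page-cache walk itself is not modelled.

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12                      /* assumption: 4KB pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

typedef unsigned long pgoff_t;

static void truncate_range(long long lstart, long long lend)
{
    /* Offsets within the partial first/last pages, as in the patch. */
    unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
    unsigned int partial_end   = (lend + 1) & (PAGE_CACHE_SIZE - 1);
    pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    pgoff_t end;   /* exclusive, unlike the inclusive byte offset 'lend' */

    if (lend == -1)
        end = (pgoff_t)-1;   /* end-of-file: highest possible page index */
    else
        end = (lend + 1) >> PAGE_CACHE_SHIFT;

    printf("lstart=%lld lend=%lld -> full pages [%lu,%lu) "
           "partial_start=%u partial_end=%u\n",
           lstart, lend, start, end, partial_start, partial_end);
}

int main(void)
{
    truncate_range(0, -1);          /* whole file */
    truncate_range(100, 8191);      /* partial head, aligned tail */
    truncate_range(4096, 6000);     /* aligned head, partial tail */
    truncate_range(100, 200);       /* hole within a single page: start > end */
    return 0;
}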
*/ /* diff --git a/mm/util.c b/mm/util.c index c55e26b17d93..7441c41d00f6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -5,6 +5,8 @@ #include <linux/err.h> #include <linux/sched.h> #include <linux/security.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/uaccess.h> #include "internal.h" @@ -293,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } #endif @@ -355,12 +356,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, { unsigned long ret; struct mm_struct *mm = current->mm; + unsigned long populate; ret = security_mmap_file(file, prot, flag); if (!ret) { down_write(&mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); + ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, + &populate); up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); } return ret; } @@ -378,6 +383,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +struct address_space *page_mapping(struct page *page) +{ + struct address_space *mapping = page->mapping; + + VM_BUG_ON(PageSlab(page)); +#ifdef CONFIG_SWAP + if (unlikely(PageSwapCache(page))) { + swp_entry_t entry; + + entry.val = page_private(page); + mapping = swap_address_space(entry); + } else +#endif + if ((unsigned long)mapping & PAGE_MAPPING_ANON) + mapping = NULL; + return mapping; +} + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5123a169ab7b..13a54953a273 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -27,10 +27,30 @@ #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> +#include <linux/llist.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> +struct vfree_deferred { + struct llist_head list; + struct work_struct wq; +}; +static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); + +static void __vunmap(const void *, int); + +static void free_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *llnode = llist_del_all(&p->list); + while (llnode) { + void *p = llnode; + llnode = llist_next(llnode); + __vunmap(p, 1); + } +} + /*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) @@ -249,19 +269,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 -struct vmap_area { - unsigned long va_start; - unsigned long va_end; - unsigned long flags; - struct rb_node rb_node; /* address sorted rbtree */ - struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ - struct vm_struct *vm; - struct rcu_head rcu_head; -}; - static DEFINE_SPINLOCK(vmap_area_lock); -static LIST_HEAD(vmap_area_list); +/* Export for kexec only */ +LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -282,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; - else if (addr > va->va_start) + else if (addr >= va->va_end) n = n->rb_right; else return va; @@ -313,7 +323,7 @@ static void __insert_vmap_area(struct vmap_area *va) rb_link_node(&va->rb_node, parent, p); 
rb_insert_color(&va->rb_node, &vmap_area_root); - /* address-sort this list so it is usable like the vmlist */ + /* address-sort this list */ tmp = rb_prev(&va->rb_node); if (tmp) { struct vmap_area *prev; @@ -378,12 +388,12 @@ nocache: addr = ALIGN(first->va_end, align); if (addr < vstart) goto nocache; - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; } else { addr = ALIGN(vstart, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; n = vmap_area_root.rb_node; @@ -410,7 +420,7 @@ nocache: if (addr + cached_hole_size < first->va_start) cached_hole_size = first->va_start - addr; addr = ALIGN(first->va_end, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; if (list_is_last(&first->list, &vmap_area_list)) @@ -744,7 +754,6 @@ struct vmap_block { struct vmap_area *va; struct vmap_block_queue *vbq; unsigned long free, dirty; - DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; struct rcu_head rcu_head; @@ -810,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vb->va = va; vb->free = VMAP_BBMAP_BITS; vb->dirty = 0; - bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); @@ -863,7 +871,6 @@ static void purge_fragmented_blocks(int cpu) if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { vb->free = 0; /* prevent further allocs after releasing lock */ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ - bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -881,11 +888,6 @@ static void purge_fragmented_blocks(int cpu) } } -static void purge_fragmented_blocks_thiscpu(void) -{ - purge_fragmented_blocks(smp_processor_id()); -} - static void purge_fragmented_blocks_allcpus(void) { int cpu; @@ -900,7 +902,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; - int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -924,17 +925,7 @@ again: if (vb->free < 1UL << order) goto next; - i = bitmap_find_free_region(vb->alloc_map, - VMAP_BBMAP_BITS, order); - - if (i < 0) { - if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { - /* fragmented and no outstanding allocations */ - BUG_ON(vb->dirty != VMAP_BBMAP_BITS); - purge = 1; - } - goto next; - } + i = VMAP_BBMAP_BITS - vb->free; addr = vb->va->va_start + (i << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(vb->va->va_start)); @@ -950,9 +941,6 @@ next: spin_unlock(&vb->lock); } - if (purge) - purge_fragmented_blocks_thiscpu(); - put_cpu_var(vmap_block_queue); rcu_read_unlock(); @@ -1125,6 +1113,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +static struct vm_struct *vmlist __initdata; /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1184,10 +1173,14 @@ void __init vmalloc_init(void) for_each_possible_cpu(i) { struct vmap_block_queue *vbq; + struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, free_work); } /* Import existing vmlist entries. 
*/ @@ -1283,41 +1276,28 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); -/*** Old vmalloc interfaces ***/ -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { + spin_lock(&vmap_area_lock); vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; va->flags |= VM_VM_AREA; + spin_unlock(&vmap_area_lock); } -static void insert_vmalloc_vmlist(struct vm_struct *vm) -{ - struct vm_struct *tmp, **p; - - vm->flags &= ~VM_UNLIST; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= vm->addr) - break; - } - vm->next = *p; - *p = vm; - write_unlock(&vmlist_lock); -} - -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) +static void clear_vm_uninitialized_flag(struct vm_struct *vm) { - setup_vmalloc_vm(vm, va, flags, caller); - insert_vmalloc_vmlist(vm); + /* + * Before removing VM_UNINITIALIZED, + * we should make sure that vm has proper values. + * Pair with smp_rmb() in show_numa_info(). + */ + smp_wmb(); + vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, @@ -1328,16 +1308,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *area; BUG_ON(in_interrupt()); - if (flags & VM_IOREMAP) { - int bit = fls(size); - - if (bit > IOREMAP_MAX_ORDER) - bit = IOREMAP_MAX_ORDER; - else if (bit < PAGE_SHIFT) - bit = PAGE_SHIFT; - - align = 1ul << bit; - } + if (flags & VM_IOREMAP) + align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); size = PAGE_ALIGN(size); if (unlikely(!size)) @@ -1358,17 +1330,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - /* - * When this function is called from __vmalloc_node_range, - * we do not add vm_struct to vmlist here to avoid - * accessing uninitialized members of vm_struct such as - * pages and nr_pages fields. They will be set later. - * To distinguish it from others, we use a VM_UNLIST flag. 
- */ - if (flags & VM_UNLIST) - setup_vmalloc_vm(area, va, flags, caller); - else - insert_vmalloc_vm(area, va, flags, caller); + setup_vmalloc_vm(area, va, flags, caller); return area; } @@ -1376,8 +1338,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, - __builtin_return_address(0)); + return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, + GFP_KERNEL, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -1385,8 +1347,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { - return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, - caller); + return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, + GFP_KERNEL, caller); } /** @@ -1401,14 +1363,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - -1, GFP_KERNEL, __builtin_return_address(0)); + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - -1, GFP_KERNEL, caller); + NUMA_NO_NODE, GFP_KERNEL, caller); } /** @@ -1446,19 +1409,10 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; - if (!(vm->flags & VM_UNLIST)) { - struct vm_struct *tmp, **p; - /* - * remove from list and disallow access to - * this vm_struct before unmap. (address range - * confliction is maintained by vmap.) - */ - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) - ; - *p = tmp->next; - write_unlock(&vmlist_lock); - } + spin_lock(&vmap_area_lock); + va->vm = NULL; + va->flags &= ~VM_VM_AREA; + spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); @@ -1476,10 +1430,9 @@ static void __vunmap(const void *addr, int deallocate_pages) if (!addr) return; - if ((PAGE_SIZE-1) & (unsigned long)addr) { - WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) return; - } area = remove_vm_area(addr); if (unlikely(!area)) { @@ -1510,7 +1463,7 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1519,15 +1472,26 @@ static void __vunmap(const void *addr, int deallocate_pages) * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is * NULL, no operation is performed. * - * Must not be called in interrupt context. 
+ * Must not be called in NMI context (strictly speaking, only if we don't + * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea) + * + * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { - BUG_ON(in_interrupt()); + BUG_ON(in_nmi()); kmemleak_free(addr); - __vunmap(addr, 1); + if (!addr) + return; + if (unlikely(in_interrupt())) { + struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); + } else + __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -1544,7 +1508,8 @@ void vunmap(const void *addr) { BUG_ON(in_interrupt()); might_sleep(); - __vunmap(addr, 0); + if (addr) + __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); @@ -1650,7 +1615,7 @@ fail: * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages - * @node: node to use for allocation or -1 + * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level @@ -1669,20 +1634,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, start, end, node, gfp_mask, caller); if (!area) goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); if (!addr) - return NULL; + goto fail; /* - * In this function, newly allocated vm_struct is not added - * to vmlist at __get_vm_area_node(). so, it is added here. + * In this function, newly allocated vm_struct has VM_UNINITIALIZED + * flag. It means that vm_struct is not fully initialized. + * Now, it is fully initialized, so remove this flag here. 
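The reworked vfree() above defers frees that arrive in interrupt context onto a per-CPU llist and lets a workqueue drain them, reusing the freed allocation's own first bytes as the list node. Below is a rough userspace analogue of that pattern, assuming C11 atomics and a single drain context; defer_free() and drain_deferred() are made-up names, and this is a model of the idea, not the kernel API.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Each deferred object must be at least this big: its first bytes are
 * reused as the list link, just as vfree() reuses the vmalloc'ed memory
 * as a struct llist_node. */
struct dnode { struct dnode *next; };

static _Atomic(struct dnode *) deferred_head = NULL;

/* Lock-free push; usable from a context that must not free directly.
 * Returns 1 if the list was previously empty (caller kicks the worker). */
static int defer_free(void *obj)
{
    struct dnode *node = obj;
    struct dnode *old = atomic_load(&deferred_head);

    do {
        node->next = old;
    } while (!atomic_compare_exchange_weak(&deferred_head, &old, node));

    return old == NULL;
}

/* Worker side: detach the whole list in one shot and free everything. */
static void drain_deferred(void)
{
    struct dnode *node = atomic_exchange(&deferred_head, NULL);

    while (node) {
        struct dnode *next = node->next;
        free(node);             /* stands in for __vunmap(p, 1) */
        node = next;
    }
}

int main(void)
{
    for (int i = 0; i < 4; i++) {
        void *buf = malloc(128);            /* >= sizeof(struct dnode) */
        if (defer_free(buf))
            printf("list was empty: schedule the drain work\n");
    }
    drain_deferred();
    return 0;
}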
*/ - insert_vmalloc_vmlist(area); + clear_vm_uninitialized_flag(area); /* * A ref_count = 3 is needed because the vm_struct and vmap_area @@ -1706,7 +1672,7 @@ fail: * @align: desired alignment * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages - * @node: node to use for allocation or -1 + * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level @@ -1723,7 +1689,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, 1, gfp_mask, prot, -1, + return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -1746,7 +1712,8 @@ static inline void *__vmalloc_node_flags(unsigned long size, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); + return __vmalloc_node_flags(size, NUMA_NO_NODE, + GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -1762,7 +1729,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, -1, + return __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1781,7 +1748,8 @@ void *vmalloc_user(unsigned long size) ret = __vmalloc_node(size, SHMLBA, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL, -1, __builtin_return_address(0)); + PAGE_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; @@ -1846,7 +1814,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, - -1, __builtin_return_address(0)); + NUMA_NO_NODE, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) @@ -1867,7 +1835,7 @@ void *vmalloc_exec(unsigned long size) void *vmalloc_32(unsigned long size) { return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - -1, __builtin_return_address(0)); + NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1884,7 +1852,7 @@ void *vmalloc_32_user(unsigned long size) void *ret; ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - -1, __builtin_return_address(0)); + NUMA_NO_NODE, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; @@ -2002,7 +1970,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) long vread(char *buf, char *addr, unsigned long count) { - struct vm_struct *tmp; + struct vmap_area *va; + struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; unsigned long n; @@ -2011,10 +1980,17 @@ long vread(char *buf, char *addr, unsigned long count) if ((unsigned long) addr + count < count) count = -(unsigned long) addr; - read_lock(&vmlist_lock); - for (tmp = vmlist; count && tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + vm->size - PAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -2024,10 +2000,10 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + 
tmp->size - PAGE_SIZE - addr; + n = vaddr + vm->size - PAGE_SIZE - addr; if (n > count) n = count; - if (!(tmp->flags & VM_IOREMAP)) + if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); @@ -2036,7 +2012,7 @@ long vread(char *buf, char *addr, unsigned long count) count -= n; } finished: - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); if (buf == buf_start) return 0; @@ -2075,7 +2051,8 @@ finished: long vwrite(char *buf, char *addr, unsigned long count) { - struct vm_struct *tmp; + struct vmap_area *va; + struct vm_struct *vm; char *vaddr; unsigned long n, buflen; int copied = 0; @@ -2085,10 +2062,17 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; buflen = count; - read_lock(&vmlist_lock); - for (tmp = vmlist; count && tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + vm->size - PAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -2097,10 +2081,10 @@ long vwrite(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + vm->size - PAGE_SIZE - addr; if (n > count) n = count; - if (!(tmp->flags & VM_IOREMAP)) { + if (!(vm->flags & VM_IOREMAP)) { aligned_vwrite(buf, addr, n); copied++; } @@ -2109,49 +2093,50 @@ long vwrite(char *buf, char *addr, unsigned long count) count -= n; } finished: - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); if (!copied) return 0; return buflen; } /** - * remap_vmalloc_range - map vmalloc pages to userspace - * @vma: vma to cover (map full range of vma) - * @addr: vmalloc memory - * @pgoff: number of pages into addr before first page to map + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of map area * * Returns: 0 for success, -Exxx on failure * - * This function checks that addr is a valid vmalloc'ed area, and - * that it is big enough to cover the vma. Will return failure if - * that criteria isn't met. + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. 
* * Similar to remap_pfn_range() (see mm/memory.c) */ -int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, - unsigned long pgoff) +int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, + void *kaddr, unsigned long size) { struct vm_struct *area; - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; - if ((PAGE_SIZE-1) & (unsigned long)addr) + size = PAGE_ALIGN(size); + + if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; - area = find_vm_area(addr); + area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + if (kaddr + size > area->addr + area->size) return -EINVAL; - addr += pgoff << PAGE_SHIFT; do { - struct page *page = vmalloc_to_page(addr); + struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); @@ -2159,14 +2144,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, return ret; uaddr += PAGE_SIZE; - addr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); + kaddr += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size > 0); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } +EXPORT_SYMBOL(remap_vmalloc_range_partial); + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_range_partial(vma, vma->vm_start, + addr + (pgoff << PAGE_SHIFT), + vma->vm_end - vma->vm_start); +} EXPORT_SYMBOL(remap_vmalloc_range); /* @@ -2480,8 +2488,8 @@ found: /* insert all vm's */ for (area = 0; area < nr_vms; area++) - insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, - pcpu_get_vm_areas); + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); kfree(vas); return vms; @@ -2516,19 +2524,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) - __acquires(&vmlist_lock) + __acquires(&vmap_area_lock) { loff_t n = *pos; - struct vm_struct *v; + struct vmap_area *va; - read_lock(&vmlist_lock); - v = vmlist; - while (n > 0 && v) { + spin_lock(&vmap_area_lock); + va = list_entry((&vmap_area_list)->next, typeof(*va), list); + while (n > 0 && &va->list != &vmap_area_list) { n--; - v = v->next; + va = list_entry(va->list.next, typeof(*va), list); } - if (!n) - return v; + if (!n && &va->list != &vmap_area_list) + return va; return NULL; @@ -2536,16 +2544,20 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vm_struct *v = p; + struct vmap_area *va = p, *next; ++*pos; - return v->next; + next = list_entry(va->list.next, typeof(*va), list); + if (&next->list != &vmap_area_list) + return next; + + return NULL; } static void s_stop(struct seq_file *m, void *p) - __releases(&vmlist_lock) + __releases(&vmap_area_lock) { - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) @@ 
-2569,7 +2581,25 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vm_struct *v = p; + struct vmap_area *va = p; + struct vm_struct *v; + + if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) + return 0; + + if (!(va->flags & VM_VM_AREA)) { + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + return 0; + } + + v = va->vm; + + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + if (v->flags & VM_UNINITIALIZED) + return 0; seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); @@ -2642,5 +2672,53 @@ static int __init proc_vmalloc_init(void) return 0; } module_init(proc_vmalloc_init); + +void get_vmalloc_info(struct vmalloc_info *vmi) +{ + struct vmap_area *va; + unsigned long free_area_size; + unsigned long prev_end; + + vmi->used = 0; + vmi->largest_chunk = 0; + + prev_end = VMALLOC_START; + + spin_lock(&vmap_area_lock); + + if (list_empty(&vmap_area_list)) { + vmi->largest_chunk = VMALLOC_TOTAL; + goto out; + } + + list_for_each_entry(va, &vmap_area_list, list) { + unsigned long addr = va->va_start; + + /* + * Some archs keep another range for modules in vmalloc space + */ + if (addr < VMALLOC_START) + continue; + if (addr >= VMALLOC_END) + break; + + if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) + continue; + + vmi->used += (va->va_end - va->va_start); + + free_area_size = addr - prev_end; + if (vmi->largest_chunk < free_area_size) + vmi->largest_chunk = free_area_size; + + prev_end = va->va_end; + } + + if (VMALLOC_END - prev_end > vmi->largest_chunk) + vmi->largest_chunk = VMALLOC_END - prev_end; + +out: + spin_unlock(&vmap_area_lock); +} #endif diff --git a/mm/vmpressure.c b/mm/vmpressure.c new file mode 100644 index 000000000000..736a6011c2c8 --- /dev/null +++ b/mm/vmpressure.c @@ -0,0 +1,374 @@ +/* + * Linux VM pressure + * + * Copyright 2012 Linaro Ltd. + * Anton Vorontsov <anton.vorontsov@linaro.org> + * + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmstat.h> +#include <linux/eventfd.h> +#include <linux/swap.h> +#include <linux/printk.h> +#include <linux/vmpressure.h> + +/* + * The window size (vmpressure_win) is the number of scanned pages before + * we try to analyze scanned/reclaimed ratio. So the window is used as a + * rate-limit tunable for the "low" level notification, and also for + * averaging the ratio for medium/critical levels. Using small window + * sizes can cause lot of false positives, but too big window size will + * delay the notifications. + * + * As the vmscan reclaimer logic works with chunks which are multiple of + * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. + * + * TODO: Make the window size depend on machine size, as we do for vmstat + * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). + */ +static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + +/* + * These thresholds are used when we account memory pressure through + * scanned/reclaimed ratio. 
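get_vmalloc_info(), added at the end of the vmalloc.c hunk above, derives the used byte count and the largest free chunk by walking the address-sorted vmap_area_list and tracking the gap after each area. Here is a small standalone sketch of that scan over a sorted array of [start, end) ranges; the bounds and ranges are made-up stand-ins for VMALLOC_START/VMALLOC_END and real vmap areas.

#include <stdio.h>

struct range { unsigned long start, end; };   /* [start, end), address sorted */

int main(void)
{
    const unsigned long vm_start = 0x1000, vm_end = 0x20000; /* assumed bounds */
    const struct range areas[] = {
        { 0x2000, 0x3000 }, { 0x3000, 0x7000 }, { 0x9000, 0xa000 },
    };
    unsigned long used = 0, largest = 0, prev_end = vm_start;

    for (unsigned i = 0; i < sizeof(areas) / sizeof(areas[0]); i++) {
        used += areas[i].end - areas[i].start;
        if (areas[i].start - prev_end > largest)
            largest = areas[i].start - prev_end;   /* gap before this area */
        prev_end = areas[i].end;
    }
    if (vm_end - prev_end > largest)
        largest = vm_end - prev_end;               /* trailing gap */

    printf("used=%lu largest_chunk=%lu\n", used, largest);
    return 0;
}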
The current values were chosen empirically. In + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. + */ +static const unsigned int vmpressure_level_med = 60; +static const unsigned int vmpressure_level_critical = 95; + +/* + * When there are too little pages left to scan, vmpressure() may miss the + * critical pressure as number of pages will be less than "window size". + * However, in that case the vmscan priority will raise fast as the + * reclaimer will try to scan LRUs more deeply. + * + * The vmscan logic considers these special priorities: + * + * prio == DEF_PRIORITY (12): reclaimer starts with that value + * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed + * prio == 0 : close to OOM, kernel scans every page in an lru + * + * Any value in this range is acceptable for this tunable (i.e. from 12 to + * 0). Current value for the vmpressure_level_critical_prio is chosen + * empirically, but the number, in essence, means that we consider + * critical level when scanning depth is ~10% of the lru size (vmscan + * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one + * eights). + */ +static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); + +static struct vmpressure *work_to_vmpressure(struct work_struct *work) +{ + return container_of(work, struct vmpressure, work); +} + +static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) +{ + return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id)); +} + +static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) +{ + struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); + + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return memcg_to_vmpressure(memcg); +} + +enum vmpressure_levels { + VMPRESSURE_LOW = 0, + VMPRESSURE_MEDIUM, + VMPRESSURE_CRITICAL, + VMPRESSURE_NUM_LEVELS, +}; + +static const char * const vmpressure_str_levels[] = { + [VMPRESSURE_LOW] = "low", + [VMPRESSURE_MEDIUM] = "medium", + [VMPRESSURE_CRITICAL] = "critical", +}; + +static enum vmpressure_levels vmpressure_level(unsigned long pressure) +{ + if (pressure >= vmpressure_level_critical) + return VMPRESSURE_CRITICAL; + else if (pressure >= vmpressure_level_med) + return VMPRESSURE_MEDIUM; + return VMPRESSURE_LOW; +} + +static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, + unsigned long reclaimed) +{ + unsigned long scale = scanned + reclaimed; + unsigned long pressure; + + /* + * We calculate the ratio (in percents) of how many pages were + * scanned vs. reclaimed in a given time frame (window). Note that + * time is in VM reclaimer's "ticks", i.e. number of pages + * scanned. This makes it possible to set desired reaction time + * and serves as a ratelimit. 
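To make the ratio concrete: a standalone sketch of the vmpressure_calc_level() arithmetic that follows, using the same integer math and the 60/95 thresholds from this file. The sample scanned/reclaimed pairs are arbitrary, and scanned must be non-zero, as the kernel guarantees before this point.

#include <stdio.h>

static const unsigned int level_med = 60, level_critical = 95;

static const char *calc_level(unsigned long scanned, unsigned long reclaimed)
{
    unsigned long scale = scanned + reclaimed;
    /* Same integer math as the patch: the share of the window that was
     * scanned but not reclaimed, expressed in percent. */
    unsigned long pressure = scale - (reclaimed * scale / scanned);

    pressure = pressure * 100 / scale;

    if (pressure >= level_critical)
        return "critical";
    if (pressure >= level_med)
        return "medium";
    return "low";
}

int main(void)
{
    /* 512-page window, i.e. SWAP_CLUSTER_MAX * 16 with 32-page clusters. */
    printf("%s\n", calc_level(512, 400));  /* most pages reclaimed -> low      */
    printf("%s\n", calc_level(512, 128));  /* 75% ineffective      -> medium   */
    printf("%s\n", calc_level(512, 8));    /* nearly nothing freed -> critical */
    return 0;
}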
+ */ + pressure = scale - (reclaimed * scale / scanned); + pressure = pressure * 100 / scale; + + pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, + scanned, reclaimed); + + return vmpressure_level(pressure); +} + +struct vmpressure_event { + struct eventfd_ctx *efd; + enum vmpressure_levels level; + struct list_head node; +}; + +static bool vmpressure_event(struct vmpressure *vmpr, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure_event *ev; + enum vmpressure_levels level; + bool signalled = false; + + level = vmpressure_calc_level(scanned, reclaimed); + + mutex_lock(&vmpr->events_lock); + + list_for_each_entry(ev, &vmpr->events, node) { + if (level >= ev->level) { + eventfd_signal(ev->efd, 1); + signalled = true; + } + } + + mutex_unlock(&vmpr->events_lock); + + return signalled; +} + +static void vmpressure_work_fn(struct work_struct *work) +{ + struct vmpressure *vmpr = work_to_vmpressure(work); + unsigned long scanned; + unsigned long reclaimed; + + /* + * Several contexts might be calling vmpressure(), so it is + * possible that the work was rescheduled again before the old + * work context cleared the counters. In that case we will run + * just after the old work returns, but then scanned might be zero + * here. No need for any locks here since we don't care if + * vmpr->reclaimed is in sync. + */ + if (!vmpr->scanned) + return; + + mutex_lock(&vmpr->sr_lock); + scanned = vmpr->scanned; + reclaimed = vmpr->reclaimed; + vmpr->scanned = 0; + vmpr->reclaimed = 0; + mutex_unlock(&vmpr->sr_lock); + + do { + if (vmpressure_event(vmpr, scanned, reclaimed)) + break; + /* + * If not handled, propagate the event upward into the + * hierarchy. + */ + } while ((vmpr = vmpressure_parent(vmpr))); +} + +/** + * vmpressure() - Account memory pressure through scanned/reclaimed ratio + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @scanned: number of pages scanned + * @reclaimed: number of pages reclaimed + * + * This function should be called from the vmscan reclaim path to account + * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw + * pressure index is then further refined and averaged over time. + * + * This function does not return any value. + */ +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + + /* + * Here we only want to account pressure that userland is able to + * help us with. For example, suppose that DMA zone is under + * pressure; if we notify userland about that kind of pressure, + * then it will be mostly a waste as it will trigger unnecessary + * freeing of memory by userland (since userland is more likely to + * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That + * is why we include only movable, highmem and FS/IO pages. + * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so + * we account it too. + */ + if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) + return; + + /* + * If we got here with no pages scanned, then that is an indicator + * that reclaimer was unable to find any shrinkable LRUs at the + * current scanning depth. But it does not mean that we should + * report the critical pressure, yet. If the scanning priority + * (scanning depth) goes too high (deep), we will be notified + * through vmpressure_prio(). But so far, keep calm. 
+ */ + if (!scanned) + return; + + mutex_lock(&vmpr->sr_lock); + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; + scanned = vmpr->scanned; + mutex_unlock(&vmpr->sr_lock); + + if (scanned < vmpressure_win || work_pending(&vmpr->work)) + return; + schedule_work(&vmpr->work); +} + +/** + * vmpressure_prio() - Account memory pressure through reclaimer priority level + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @prio: reclaimer's priority + * + * This function should be called from the reclaim path every time when + * the vmscan's reclaiming priority (scanning depth) changes. + * + * This function does not return any value. + */ +void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) +{ + /* + * We only use prio for accounting critical level. For more info + * see comment for vmpressure_level_critical_prio variable above. + */ + if (prio > vmpressure_level_critical_prio) + return; + + /* + * OK, the prio is below the threshold, updating vmpressure + * information before shrinker dives into long shrinking of long + * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 + * to the vmpressure() basically means that we signal 'critical' + * level. + */ + vmpressure(gfp, memcg, vmpressure_win, 0); +} + +/** + * vmpressure_register_event() - Bind vmpressure notifications to an eventfd + * @cg: cgroup that is interested in vmpressure notifications + * @cft: cgroup control files handle + * @eventfd: eventfd context to link notifications with + * @args: event arguments (used to set up a pressure level threshold) + * + * This function associates eventfd context with the vmpressure + * infrastructure, so that the notifications will be delivered to the + * @eventfd. The @args parameter is a string that denotes pressure level + * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or + * "critical"). + * + * This function should not be used directly, just pass it to (struct + * cftype).register_event, and then cgroup core will handle everything by + * itself. + */ +int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd, const char *args) +{ + struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure_event *ev; + int level; + + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { + if (!strcmp(vmpressure_str_levels[level], args)) + break; + } + + if (level >= VMPRESSURE_NUM_LEVELS) + return -EINVAL; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + + ev->efd = eventfd; + ev->level = level; + + mutex_lock(&vmpr->events_lock); + list_add(&ev->node, &vmpr->events); + mutex_unlock(&vmpr->events_lock); + + return 0; +} + +/** + * vmpressure_unregister_event() - Unbind eventfd from vmpressure + * @cg: cgroup handle + * @cft: cgroup control files handle + * @eventfd: eventfd context that was used to link vmpressure with the @cg + * + * This function does internal manipulations to detach the @eventfd from + * the vmpressure notifications, and then frees internal resources + * associated with the @eventfd (but the @eventfd itself is not freed). + * + * This function should not be used directly, just pass it to (struct + * cftype).unregister_event, and then cgroup core will handle everything + * by itself. 
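For completeness, the userspace side of this registration goes through the memcg v1 eventfd mechanism rather than calling vmpressure_register_event() directly: open an eventfd and memory.pressure_level, write "<event_fd> <pressure_level_fd> <level>" to cgroup.event_control, then block on the eventfd. A sketch under the assumption that a v1 memory cgroup is mounted at /sys/fs/cgroup/memory; error handling is minimal and the path is a placeholder.

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
    const char *grp = "/sys/fs/cgroup/memory";   /* assumed mount point */
    char buf[64], path[128];
    uint64_t count;

    int efd = eventfd(0, 0);
    snprintf(path, sizeof(path), "%s/memory.pressure_level", grp);
    int cfd = open(path, O_RDONLY);
    snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
    int ctl = open(path, O_WRONLY);
    if (efd < 0 || cfd < 0 || ctl < 0) {
        perror("setup");
        return 1;
    }

    /* "<eventfd> <pressure_level fd> <low|medium|critical>" */
    int n = snprintf(buf, sizeof(buf), "%d %d medium", efd, cfd);
    if (write(ctl, buf, n) != n) {
        perror("register");
        return 1;
    }

    while (read(efd, &count, sizeof(count)) == sizeof(count))
        printf("medium (or worse) pressure signalled %llu time(s)\n",
               (unsigned long long)count);
    return 0;
}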
+ */ +void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd) +{ + struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure_event *ev; + + mutex_lock(&vmpr->events_lock); + list_for_each_entry(ev, &vmpr->events, node) { + if (ev->efd != eventfd) + continue; + list_del(&ev->node); + kfree(ev); + break; + } + mutex_unlock(&vmpr->events_lock); +} + +/** + * vmpressure_init() - Initialize vmpressure control structure + * @vmpr: Structure to be initialized + * + * This function should be called on every allocated vmpressure structure + * before any usage. + */ +void vmpressure_init(struct vmpressure *vmpr) +{ + mutex_init(&vmpr->sr_lock); + mutex_init(&vmpr->events_lock); + INIT_LIST_HEAD(&vmpr->events); + INIT_WORK(&vmpr->work, vmpressure_work_fn); +} diff --git a/mm/vmscan.c b/mm/vmscan.c index adc7e9058181..2cff0d491c6d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -19,6 +19,7 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> @@ -128,7 +129,7 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -long vm_total_pages; /* The total number of pages which the VM controls */ +unsigned long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -545,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) void putback_lru_page(struct page *page) { int lru; - int active = !!TestClearPageActive(page); int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -560,8 +560,8 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_lru_base_type(page); - lru_cache_add_lru(page, lru); + lru = page_lru_base_type(page); + lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable @@ -668,6 +668,35 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct address_space *mapping; + + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context. 
Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -676,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; cond_resched(); @@ -695,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct page *page; int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; cond_resched(); @@ -722,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + (writeback && PageReclaim(page))) + nr_congested++; + + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. 
+ * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * memcg doesn't have any dirty pages throttling so we - * could easily OOM just because too many pages are in - * writeback and there is nothing else to reclaim. - * - * Check __GFP_IO, certainly because a loop driver - * thread might enter reclaim, and deadlock if it waits - * on a page for which it is needed to do the write - * (loop masks off __GFP_IO|__GFP_FS for this reason); - * but more thought would probably show more reasons. - * - * Don't require __GFP_FS, since we're not going into - * the FS, just waiting on its writeback completion. - * Worryingly, ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so - * testing may_enter_fs here is liable to OOM on them. - */ - if (global_reclaim(sc) || + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + nr_immediate++; + goto keep_locked; + + /* Case 2 above */ + } else if (global_reclaim(sc) || !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { /* * This is slightly racy - end_page_writeback() @@ -755,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; + goto keep_locked; + + /* Case 3 above */ + } else { + wait_on_page_writeback(page); } - wait_on_page_writeback(page); } if (!force_reclaim) @@ -780,12 +871,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageAnon(page) && !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page)) + if (!add_to_swap(page, page_list)) goto activate_locked; may_enter_fs = 1; - } - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } /* * The page is mapped into the page tables of one or more @@ -805,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - nr_dirty++; - /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. */ if (page_is_file_cache(page) && (!current_is_kswapd() || - sc->priority >= DEF_PRIORITY - 2)) { + !zone_is_reclaim_dirty(zone))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -837,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: - nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -945,22 +1034,16 @@ keep: VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } - /* - * Tag a zone as congested if all the dirty pages encountered were - * backed by a congested BDI. 
In this case, reclaimers should just - * back off and wait for congestion to clear because further reclaim - * will encounter the same problem - */ - if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) - zone_set_flag(zone, ZONE_CONGESTED); - free_hot_cold_page_list(&free_pages, 1); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); mem_cgroup_uncharge_end(); *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; return nr_reclaimed; } @@ -972,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; struct page *page, *next; LIST_HEAD(clean_pages); @@ -984,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; @@ -1280,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_reclaimed = 0; unsigned long nr_taken; unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct zone *zone = lruvec_zone(lruvec); @@ -1322,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_dirty, &nr_writeback, false); + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); spin_lock_irq(&zone->lru_lock); @@ -1355,21 +1443,51 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * as there is no guarantee the dirtying process is throttled in the * same way balance_dirty_pages() manages. * - * This scales the number of dirty pages that must be under writeback - * before throttling depending on priority. It is a simple backoff - * function that has the most effect in the range DEF_PRIORITY to - * DEF_PRIORITY-2 which is the priority reclaim is considered to be - * in trouble and reclaim is considered to be in trouble. - * - * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle - * DEF_PRIORITY-1 50% must be PageWriteback - * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble - * ... - * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any - * isolated page is PageWriteback + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under pages flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. */ - if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + if (nr_writeback && nr_writeback == nr_taken) + zone_set_flag(zone, ZONE_WRITEBACK); + + /* + * memcg will stall in page writeback so only consider forcibly + * stalling for global reclaim + */ + if (global_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. 
+ */ + if (nr_dirty && nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing + * pages from reclaim context. It will forcibly stall in the + * next check. + */ + if (nr_unqueued_dirty == nr_taken) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + + /* + * In addition, if kswapd scans pages marked marked for + * immediate reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_unqueued_dirty == nr_taken || nr_immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, @@ -1579,16 +1697,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) } #endif -static int inactive_file_is_low_global(struct zone *zone) -{ - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_FILE); - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - - return (active > inactive); -} - /** * inactive_file_is_low - check if file pages need to be deactivated * @lruvec: LRU vector to check @@ -1605,10 +1713,13 @@ static int inactive_file_is_low_global(struct zone *zone) */ static int inactive_file_is_low(struct lruvec *lruvec) { - if (!mem_cgroup_disabled()) - return mem_cgroup_inactive_file_is_low(lruvec); + unsigned long inactive; + unsigned long active; + + inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); + active = get_lru_size(lruvec, LRU_ACTIVE_FILE); - return inactive_file_is_low_global(lruvec_zone(lruvec)); + return active > inactive; } static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) @@ -1638,6 +1749,13 @@ static int vmscan_swappiness(struct scan_control *sc) return mem_cgroup_swappiness(sc->target_mem_cgroup); } +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -1650,15 +1768,16 @@ static int vmscan_swappiness(struct scan_control *sc) static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { - unsigned long anon, file, free; + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + u64 fraction[2]; + u64 denominator = 0; /* gcc */ + struct zone *zone = lruvec_zone(lruvec); unsigned long anon_prio, file_prio; + enum scan_balance scan_balance; + unsigned long anon, file, free; + bool force_scan = false; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - u64 fraction[2], denominator; enum lru_list lru; - int noswap = 0; - bool force_scan = false; - struct zone *zone = lruvec_zone(lruvec); /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1676,11 +1795,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, force_scan = true; /* If we have no swap space, do not bother scanning anon pages. 
*/ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - noswap = 1; - fraction[0] = 0; - fraction[1] = 1; - denominator = 1; + if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Global reclaim will swap to prevent OOM even with no + * swappiness, but memcg users want to use this knob to + * disable swapping for individual groups completely when + * using the memory controller's swap limit feature would be + * too expensive. + */ + if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && vmscan_swappiness(sc)) { + scan_balance = SCAN_EQUAL; goto out; } @@ -1689,30 +1827,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + get_lru_size(lruvec, LRU_INACTIVE_FILE); + /* + * If it's foreseeable that reclaiming the file cache won't be + * enough to get the zone back into a desirable shape, we have + * to swap. Better start now and leave the - probably heavily + * thrashing - remaining file pages alone. + */ if (global_reclaim(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); + free = zone_page_state(zone, NR_FREE_PAGES); if (unlikely(file + free <= high_wmark_pages(zone))) { - /* - * If we have very few page cache pages, force-scan - * anon pages. - */ - fraction[0] = 1; - fraction[1] = 0; - denominator = 1; - goto out; - } else if (!inactive_file_is_low_global(zone)) { - /* - * There is enough inactive page cache, do not - * reclaim anything from the working set right now. - */ - fraction[0] = 0; - fraction[1] = 1; - denominator = 1; + scan_balance = SCAN_ANON; goto out; } } /* + * There is enough inactive page cache, do not reclaim + * anything from the anonymous working set right now. + */ + if (!inactive_file_is_low(lruvec)) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ @@ -1759,19 +1899,143 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); + unsigned long size; unsigned long scan; - scan = get_lru_size(lruvec, lru); - if (sc->priority || noswap || !vmscan_swappiness(sc)) { - scan >>= sc->priority; - if (!scan && force_scan) - scan = SWAP_CLUSTER_MAX; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; + + if (!scan && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ scan = div64_u64(scan * fraction[file], denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); } nr[lru] = scan; } } +/* + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
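The out: loop above reduces each LRU's target to size >> priority and then applies the chosen scan_balance policy. A compact illustration of just that arithmetic (userspace sketch; force_scan and the SWAP_CLUSTER_MAX clamp are left out):

    #include <stdint.h>
    #include <stdio.h>

    enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

    /* Illustrative version of the per-LRU target computed by get_scan_count(). */
    static unsigned long scan_target(unsigned long lru_size, int priority,
                                     enum scan_balance sb, int is_file,
                                     uint64_t frac_anon, uint64_t frac_file,
                                     uint64_t denominator)
    {
        unsigned long scan = lru_size >> priority;

        switch (sb) {
        case SCAN_EQUAL:   /* scan lists relative to their size */
            break;
        case SCAN_FRACT:   /* proportional to swappiness and recent reclaim efficiency */
            scan = (unsigned long)((is_file ? frac_file : frac_anon) * scan / denominator);
            break;
        case SCAN_FILE:
        case SCAN_ANON:    /* scan one type exclusively */
            if ((sb == SCAN_FILE) != is_file)
                scan = 0;
            break;
        }
        return scan;
    }

    int main(void)
    {
        /* e.g. 1M inactive file pages at DEF_PRIORITY (12) with a 1:3 anon:file split */
        printf("%lu pages to scan\n",
               scan_target(1UL << 20, 12, SCAN_FRACT, 1, 1, 3, 4));
        return 0;
    }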
+ */ +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; + bool scan_adjusted = false; + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + + /* + * For global direct reclaim, reclaim only the number of pages + * requested. Less care is taken to scan proportionally as it + * is more important to minimise direct reclaim stall latency + * than it is to properly age the LRU lists. + */ + if (global_reclaim(sc) && !current_is_kswapd()) + break; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs shrink + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + + throttle_vm_writeout(sc->gfp_mask); +} + /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { @@ -1790,7 +2054,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) * calls try_to_compact_zone() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. 
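The proportional cutoff in shrink_lruvec() above is ratio bookkeeping: once nr_to_reclaim is met, the LRU type with less work left is dropped and the survivor is rescaled to the fraction of its original target that the dropped type managed to complete. A worked example of that arithmetic, with hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long target = 1000;     /* surviving LRU's original scan target */
        unsigned long remaining = 600;   /* of which this much was still unscanned */
        unsigned long percentage = 40;   /* share of the dropped LRU's target left unscanned */

        unsigned long scanned = target - remaining;                   /* 400 */
        unsigned long rescaled = target * (100 - percentage) / 100;   /* 600 */
        unsigned long new_remaining = rescaled - (rescaled < scanned ? rescaled : scanned);

        printf("keep scanning %lu more pages of the original %lu\n",
               new_remaining, target);   /* 200 */
        return 0;
    }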
*/ -static inline bool should_continue_reclaim(struct lruvec *lruvec, +static inline bool should_continue_reclaim(struct zone *zone, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) @@ -1830,15 +2094,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); - if (nr_swap_pages > 0) - inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); + inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { + switch (compaction_suitable(zone, sc->order)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -1847,98 +2111,53 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, } } -/* - * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. - */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { - unsigned long nr[NR_LRU_LISTS]; - unsigned long nr_to_scan; - enum lru_list lru; unsigned long nr_reclaimed, nr_scanned; - unsigned long nr_to_reclaim = sc->nr_to_reclaim; - struct blk_plug plug; - -restart: - nr_reclaimed = 0; - nr_scanned = sc->nr_scanned; - get_scan_count(lruvec, sc, nr); - - blk_start_plug(&plug); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || - nr[LRU_INACTIVE_FILE]) { - for_each_evictable_lru(lru) { - if (nr[lru]) { - nr_to_scan = min_t(unsigned long, - nr[lru], SWAP_CLUSTER_MAX); - nr[lru] -= nr_to_scan; - nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); - } - } - /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. - * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. - */ - if (nr_reclaimed >= nr_to_reclaim && - sc->priority < DEF_PRIORITY) - break; - } - blk_finish_plug(&plug); - sc->nr_reclaimed += nr_reclaimed; + do { + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = sc->priority, + }; + struct mem_cgroup *memcg; - /* - * Even if we did not try to evict anon pages at all, we want to - * rebalance the anon lru active/inactive ratio. 
- */ - if (inactive_anon_is_low(lruvec)) - shrink_active_list(SWAP_CLUSTER_MAX, lruvec, - sc, LRU_ACTIVE_ANON); + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; - /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(lruvec, nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)) - goto restart; + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct lruvec *lruvec; - throttle_vm_writeout(sc->gfp_mask); -} + lruvec = mem_cgroup_zone_lruvec(zone, memcg); -static void shrink_zone(struct zone *zone, struct scan_control *sc) -{ - struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, - .priority = sc->priority, - }; - struct mem_cgroup *memcg; + shrink_lruvec(lruvec, sc); - memcg = mem_cgroup_iter(root, NULL, &reclaim); - do { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + /* + * Direct reclaim and kswapd have to scan all memory + * cgroups to fulfill the overall scan target for the + * zone. + * + * Limit reclaim, on the other hand, only cares about + * nr_to_reclaim pages to be reclaimed and it will + * retry with decreasing priority if one round over the + * whole hierarchy is not sufficient. + */ + if (!global_reclaim(sc) && + sc->nr_reclaimed >= sc->nr_to_reclaim) { + mem_cgroup_iter_break(root, memcg); + break; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); - shrink_lruvec(lruvec, sc); + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); - /* - * Limit reclaim has historically picked one memcg and - * scanned it with decreasing priority levels until - * nr_to_reclaim had been reclaimed. This priority - * cycle is thus over after a single memcg. - * - * Direct reclaim and kswapd, on the other hand, have - * to scan all memory cgroups to fulfill the overall - * scan target for the zone. - */ - if (!global_reclaim(sc)) { - mem_cgroup_iter_break(root, memcg); - break; - } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)); } /* Returns true if compaction should go ahead for a high-order request */ @@ -1958,7 +2177,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * a reasonable chance of completing and allocating the page */ balance_gap = min(low_wmark_pages(zone), - (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / KSWAPD_ZONE_BALANCE_GAP_RATIO); watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); @@ -2122,12 +2341,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, count_vm_event(ALLOCSTALL); do { + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; aborted_reclaim = shrink_zones(zonelist, sc); /* - * Don't shrink slabs when reclaiming memory from - * over limit cgroups + * Don't shrink slabs when reclaiming memory from over limit + * cgroups but do shrink slab at least once when aborting + * reclaim for compaction to avoid unevenly scanning file/anon + * LRU pages over slab pages. 
*/ if (global_reclaim(sc)) { unsigned long lru_pages = 0; @@ -2150,6 +2373,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, goto out; /* + * If we're getting trouble reclaiming, start doing + * writepage even in laptop mode. + */ + if (sc->priority < DEF_PRIORITY - 2) + sc->may_writepage = 1; + + /* * Try to write back as many pages as we just scanned. This * tends to cause slow streaming writers to write data to the * disk smoothly, at the dirtying rate, which is nice. But @@ -2162,18 +2392,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - - /* Take a nap, wait for some writeback to complete */ - if (!sc->hibernation_mode && sc->nr_scanned && - sc->priority < DEF_PRIORITY - 2) { - struct zone *preferred_zone; - - first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - &cpuset_current_mems_allowed, - &preferred_zone); - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); - } - } while (--sc->priority >= 0); + } while (--sc->priority >= 0 && !aborted_reclaim); out: delayacct_freepages_end(); @@ -2300,7 +2519,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { - .gfp_mask = gfp_mask, + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .may_writepage = !laptop_mode, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, @@ -2452,12 +2671,16 @@ static bool zone_balanced(struct zone *zone, int order, } /* - * pgdat_balanced is used when checking if a node is balanced for high-order - * allocations. Only zones that meet watermarks and are in a zone allowed - * by the callers classzone_idx are added to balanced_pages. The total of - * balanced pages must be at least 25% of the zones allowed by classzone_idx - * for the node to be considered balanced. Forcing all zones to be balanced - * for high orders can cause excessive reclaim when there are imbalanced zones. + * pgdat_balanced() is used when checking if a node is balanced. + * + * For order-0, all zones must be balanced! + * + * For high-order allocations only zones that meet watermarks and are in a + * zone allowed by the callers classzone_idx are added to balanced_pages. The + * total of balanced pages must be at least 25% of the zones allowed by + * classzone_idx for the node to be considered balanced. Forcing all zones to + * be balanced for high orders can cause excessive reclaim when there are + * imbalanced zones. * The choice of 25% is due to * o a 16M DMA zone that is balanced will not balance a zone on any * reasonable sized machine @@ -2467,17 +2690,43 @@ static bool zone_balanced(struct zone *zone, int order, * Similarly, on x86-64 the Normal zone would need to be at least 1G * to balance a node on its own. These seemed like reasonable ratios. */ -static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, - int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { - unsigned long present_pages = 0; + unsigned long managed_pages = 0; + unsigned long balanced_pages = 0; int i; - for (i = 0; i <= classzone_idx; i++) - present_pages += pgdat->node_zones[i].present_pages; + /* Check the watermark levels */ + for (i = 0; i <= classzone_idx; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + managed_pages += zone->managed_pages; + + /* + * A special case here: + * + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. 
Effectively, it considers them balanced so + * they must be considered balanced here as well! + */ + if (zone->all_unreclaimable) { + balanced_pages += zone->managed_pages; + continue; + } + + if (zone_balanced(zone, order, 0, i)) + balanced_pages += zone->managed_pages; + else if (!order) + return false; + } - /* A special case here: if zone has no page, we think it's balanced */ - return balanced_pages >= (present_pages >> 2); + if (order) + return balanced_pages >= (managed_pages >> 2); + else + return true; } /* @@ -2489,10 +2738,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, int classzone_idx) { - int i; - unsigned long balanced = 0; - bool all_zones_ok = true; - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) return false; @@ -2511,39 +2756,92 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, return false; } - /* Check the watermark levels */ - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; + return pgdat_balanced(pgdat, order, classzone_idx); +} - if (!populated_zone(zone)) - continue; +/* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, + struct scan_control *sc, + unsigned long lru_pages, + unsigned long *nr_attempted) +{ + unsigned long nr_slab; + int testorder = sc->order; + unsigned long balance_gap; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + bool lowmem_pressure; - /* - * balance_pgdat() skips over all_unreclaimable after - * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well if kswapd - * is to sleep - */ - if (zone->all_unreclaimable) { - balanced += zone->present_pages; - continue; - } + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); - if (!zone_balanced(zone, order, 0, i)) - all_zones_ok = false; - else - balanced += zone->present_pages; - } + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order) != + COMPACT_SKIPPED) + testorder = 0; /* - * For high-order requests, the balanced zones must contain at least - * 25% of the nodes pages for kswapd to sleep. For order-0, all zones - * must be balanced + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. 
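In other words, pgdat_balanced() now has two regimes: an order-0 wakeup needs every populated zone up to classzone_idx to meet its watermark, while a high-order wakeup only needs balanced zones to cover a quarter of the node's managed pages. A hedged userspace model (zone_ok[] folds together zone_balanced() and the all_unreclaimable special case, and a zero managed count stands in for an unpopulated zone):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    static bool node_balanced(const unsigned long *managed, const bool *zone_ok,
                              size_t nr_zones, int order)
    {
        unsigned long total = 0, balanced = 0;
        size_t i;

        for (i = 0; i < nr_zones; i++) {
            if (!managed[i])            /* treat as unpopulated */
                continue;
            total += managed[i];
            if (zone_ok[i])
                balanced += managed[i];
            else if (!order)            /* order-0: a single bad zone fails the node */
                return false;
        }
        return order ? balanced >= total / 4 : true;
    }

    int main(void)
    {
        unsigned long managed[] = { 200000, 50000 };   /* e.g. a large and a small zone */
        bool ok[] = { true, false };

        printf("order-0: %d, order-3: %d\n",
               node_balanced(managed, ok, 2, 0),   /* 0: the small zone is unbalanced */
               node_balanced(managed, ok, 2, 3));  /* 1: 200000 >= 250000/4 */
        return 0;
    }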
*/ - if (order) - return pgdat_balanced(pgdat, balanced, classzone_idx); - else - return all_zones_ok; + balance_gap = min(low_wmark_pages(zone), + (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + + shrink_zone(zone, sc); + + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; + + zone_clear_flag(zone, ZONE_WRITEBACK); + + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (!zone->all_unreclaimable && + zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } + + return sc->nr_scanned >= sc->nr_to_reclaim; } /* @@ -2570,42 +2868,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - struct zone *unbalanced_zone; - unsigned long balanced; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long total_scanned; - struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, .may_unmap = 1, .may_swap = 1, - /* - * kswapd doesn't want to be bailed out while reclaim. because - * we want to put equal scanning pressure on each zone. - */ - .nr_to_reclaim = ULONG_MAX, + .may_writepage = !laptop_mode, .order = order, .target_mem_cgroup = NULL, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; -loop_again: - total_scanned = 0; - sc.priority = DEF_PRIORITY; - sc.nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); do { unsigned long lru_pages = 0; - int has_under_min_watermark_zone = 0; + unsigned long nr_attempted = 0; + bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); - unbalanced_zone = NULL; - balanced = 0; + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -2642,20 +2926,46 @@ loop_again: end_zone = i; break; } else { - /* If balanced, clear the congested flag */ + /* + * If balanced, clear the dirty and congested + * flags + */ zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); } } + if (i < 0) goto out; for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + lru_pages += zone_reclaimable_pages(zone); + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. 
+ */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * @@ -2666,8 +2976,6 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab, testorder; - unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2686,85 +2994,16 @@ loop_again: order, sc.gfp_mask, &nr_soft_scanned); sc.nr_reclaimed += nr_soft_reclaimed; - total_scanned += nr_soft_scanned; /* - * We put equal pressure on every zone, unless - * one zone has way too many pages free - * already. The "too many pages" is defined - * as the high wmark plus a "gap" where the - * gap is either the low watermark or 1% - * of the zone, whichever is smaller. - */ - balance_gap = min(low_wmark_pages(zone), - (zone->present_pages + - KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - /* - * Kswapd reclaims only single pages with compaction - * enabled. Trying too hard to reclaim until contiguous - * free pages have become available can hurt performance - * by evicting too much useful data from memory. - * Do not reclaim more than needed for compaction. + * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - testorder = order; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) != - COMPACT_SKIPPED) - testorder = 0; - - if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - shrink_zone(zone, &sc); - - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_scanned += sc.nr_scanned; - - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - } - - /* - * If we've done a decent amount of scanning and - * the reclaim ratio is low, start doing writepage - * even in laptop mode - */ - if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) - sc.may_writepage = 1; - - if (zone->all_unreclaimable) { - if (end_zone && end_zone == i) - end_zone--; - continue; - } - - if (!zone_balanced(zone, testorder, 0, end_zone)) { - unbalanced_zone = zone; - /* - * We are still under min water mark. This - * means that we have a GFP_ATOMIC allocation - * failure risk. Hurry up! - */ - if (!zone_watermark_ok_safe(zone, order, - min_wmark_pages(zone), end_zone, 0)) - has_under_min_watermark_zone = 1; - } else { - /* - * If a zone reaches its high watermark, - * consider it to be no longer congested. 
It's - * possible there are dirty pages backed by - * congested BDIs but as pressure is relieved, - * speculatively avoid congestion waits - */ - zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; - } - + if (kswapd_shrink_zone(zone, end_zone, &sc, + lru_pages, &nr_attempted)) + raise_priority = false; } /* @@ -2776,87 +3015,38 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) - break; /* kswapd: all done */ /* - * OK, kswapd is getting into trouble. Take a nap, then take - * another pass across the zones. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { - if (has_under_min_watermark_zone) - count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); - else - wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); - } + if (order && sc.nr_reclaimed >= 2UL << order) + order = sc.order = 0; - /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. - */ - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) break; - } while (--sc.priority >= 0); -out: - - /* - * order-0: All zones must meet high watermark for a balanced node - * high-order: Balanced zones must make up at least 25% of the node - * for the node to be balanced - */ - if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) { - cond_resched(); - - try_to_freeze(); /* - * Fragmentation may mean that the system cannot be - * rebalanced for high-order allocations in all zones. - * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, - * it means the zones have been fully scanned and are still - * not balanced. For high-order allocations, there is - * little point trying all over again as kswapd may - * infinite loop. - * - * Instead, recheck all watermarks at order-0 as they - * are the most important. If watermarks are ok, kswapd will go - * back to sleep. High-order users can still perform direct - * reclaim if they wish. + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted */ - if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) - order = sc.order = 0; - - goto loop_again; - } - - /* - * If kswapd was reclaiming at a higher order, it has the option of - * sleeping without all zones being balanced. Before it does, it must - * ensure that the watermarks for order-0 on *all* zones are met and - * that the congestion flags are cleared. The congestion flag must - * be cleared as kswapd is the only mechanism that clears the flag - * and it is potentially going to sleep here. - */ - if (order) { - int zones_need_compaction = 1; - - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* Check if the memory needs to be defragmented. 
*/ - if (zone_watermark_ok(zone, order, - low_wmark_pages(zone), *classzone_idx, 0)) - zones_need_compaction = 0; - } - - if (zones_need_compaction) + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) compact_pgdat(pgdat, order); - } + /* + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages + */ + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1 && + !pgdat_balanced(pgdat, order, *classzone_idx)); + +out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, @@ -3068,7 +3258,7 @@ unsigned long global_reclaimable_pages(void) nr = global_page_state(NR_ACTIVE_FILE) + global_page_state(NR_INACTIVE_FILE); - if (nr_swap_pages > 0) + if (get_nr_swap_pages() > 0) nr += global_page_state(NR_ACTIVE_ANON) + global_page_state(NR_INACTIVE_ANON); @@ -3082,7 +3272,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) nr = zone_page_state(zone, NR_ACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_FILE); - if (nr_swap_pages > 0) + if (get_nr_swap_pages() > 0) nr += zone_page_state(zone, NR_ACTIVE_ANON) + zone_page_state(zone, NR_INACTIVE_ANON); @@ -3137,8 +3327,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { int nid; @@ -3173,9 +3363,9 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - pgdat->kswapd = NULL; pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); + pgdat->kswapd = NULL; } return ret; } @@ -3295,9 +3485,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .nr_to_reclaim = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), - .gfp_mask = gfp_mask, + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 9800306c8195..f42745e65780 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -52,7 +52,6 @@ void all_vm_events(unsigned long *ret) } EXPORT_SYMBOL_GPL(all_vm_events); -#ifdef CONFIG_HOTPLUG /* * Fold the foreign cpu events into our own. * @@ -69,7 +68,6 @@ void vm_events_fold_cpu(int cpu) fold_state->event[i] = 0; } } -#endif /* CONFIG_HOTPLUG */ #endif /* CONFIG_VM_EVENT_COUNTERS */ @@ -142,7 +140,7 @@ int calculate_normal_threshold(struct zone *zone) * 125 1024 10 16-32 GB 9 */ - mem = zone->present_pages >> (27 - PAGE_SHIFT); + mem = zone->managed_pages >> (27 - PAGE_SHIFT); threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); @@ -495,6 +493,10 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +/* + * this is only called if !populated_zone(zone), which implies no other users of + * pset->vm_stat_diff[] exsist. 
+ */ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { int i; @@ -628,7 +630,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = { #ifdef CONFIG_CMA "CMA", #endif +#ifdef CONFIG_MEMORY_ISOLATION "Isolate", +#endif }; static void *frag_start(struct seq_file *m, loff_t *pos) @@ -768,7 +772,6 @@ const char * const vmstat_text[] = { "kswapd_inodesteal", "kswapd_low_wmark_hit_quickly", "kswapd_high_wmark_hit_quickly", - "kswapd_skip_congestion_wait", "pageoutrun", "allocstall", @@ -890,7 +893,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, int mtype; unsigned long pfn; unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = start_pfn + zone->spanned_pages; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long count[MIGRATE_TYPES] = { 0, }; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 000000000000..9bb4710e3589 --- /dev/null +++ b/mm/zbud.c @@ -0,0 +1,527 @@ +/* + * zbud.c + * + * Copyright (C) 2013, Seth Jennings, IBM + * + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. + * + * zbud is an special purpose allocator for storing compressed pages. Contrary + * to what its name may suggest, zbud is not a buddy allocator, but rather an + * allocator that "buddies" two compressed pages together in a single memory + * page. + * + * While this design limits storage density, it has simple and deterministic + * reclaim properties that make it preferable to a higher density approach when + * reclaim will be used. + * + * zbud works by storing compressed pages, or "zpages", together in pairs in a + * single memory page called a "zbud page". The first buddy is "left + * justifed" at the beginning of the zbud page, and the last buddy is "right + * justified" at the end of the zbud page. The benefit is that if either + * buddy is freed, the freed buddy space, coalesced with whatever slack space + * that existed between the buddies, results in the largest possible free region + * within the zbud page. + * + * zbud also provides an attractive lower bound on density. The ratio of zpages + * to zbud pages can not be less than 1. This ensures that zbud can never "do + * harm" by using more pages to store zpages than the uncompressed zpages would + * have used on their own. + * + * zbud pages are divided into "chunks". The size of the chunks is fixed at + * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages + * into chunks allows organizing unbuddied zbud pages into a manageable number + * of unbuddied lists according to the number of free chunks available in the + * zbud page. + * + * The zbud API differs from that of conventional allocators in that the + * allocation function, zbud_alloc(), returns an opaque handle to the user, + * not a dereferenceable pointer. The user must map the handle using + * zbud_map() in order to get a usable pointer by which to access the + * allocation data and unmap the handle with zbud_unmap() when operations + * on the allocation data are complete. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/zbud.h> + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. 
It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64, and there + * will be 64 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE + +/** + * struct zbud_pool - stores metadata for each zbud pool + * @lock: protects all pool fields and first|last_chunk fields of any + * zbud page in the pool + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; + * the lists each zbud page is added to depends on the size of + * its free region. + * @buddied: list tracking the zbud pages that contain two buddies; + * these zbud pages are full + * @lru: list tracking the zbud pages in LRU order by most recently + * added buddy. + * @pages_nr: number of zbud pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular zbud pool. + */ +struct zbud_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + struct zbud_ops *ops; +}; + +/* + * struct zbud_header - zbud page metadata occupying the first chunk of each + * zbud page. + * @buddy: links the zbud page into the unbuddied/buddied lists in the pool + * @lru: links the zbud page into the lru list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + */ +struct zbud_header { + struct list_head buddy; + struct list_head lru; + unsigned int first_chunks; + unsigned int last_chunks; + bool under_reclaim; +}; + +/***************** + * Helpers +*****************/ +/* Just to make the code easier to read */ +enum buddy { + FIRST, + LAST +}; + +/* Converts an allocation size in bytes to size in zbud chunks */ +static int size_to_chunks(int size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the zbud header of a newly allocated zbud page */ +static struct zbud_header *init_zbud_page(struct page *page) +{ + struct zbud_header *zhdr = page_address(page); + zhdr->first_chunks = 0; + zhdr->last_chunks = 0; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->lru); + zhdr->under_reclaim = 0; + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_zbud_page(struct zbud_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a zbud page + * Pool lock should be held as this function accesses first|last_chunks + */ +static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + /* + * For now, the encoded handle is actually just the pointer to the data + * but this might not always be the case. A little information hiding. + * Add CHUNK_SIZE to the handle if it is the first allocation to jump + * over the zbud header in the first chunk. 
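With the defaults above and 4 KiB pages, a zbud page is carved into 64 chunks of 64 bytes, the first chunk is reserved for the header, and every allocation is rounded up to whole chunks. A small standalone check of that arithmetic (PAGE_SHIFT of 12 is an assumption here, matching common configurations):

    #include <stdio.h>

    #define PAGE_SHIFT    12                  /* assumed 4 KiB pages */
    #define PAGE_SIZE     (1UL << PAGE_SHIFT)
    #define NCHUNKS_ORDER 6
    #define CHUNK_SHIFT   (PAGE_SHIFT - NCHUNKS_ORDER)
    #define CHUNK_SIZE    (1UL << CHUNK_SHIFT)
    #define NCHUNKS       (PAGE_SIZE >> CHUNK_SHIFT)

    static unsigned long size_to_chunks(unsigned long size)
    {
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;   /* round up */
    }

    int main(void)
    {
        /* A 700-byte compressed page takes 11 of the 63 usable chunks
         * (NCHUNKS - 1, since one chunk holds the zbud header). */
        printf("chunk size %lu, chunks per page %lu, 700 bytes -> %lu chunks\n",
               CHUNK_SIZE, NCHUNKS, size_to_chunks(700));
        return 0;
    }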
+ */ + handle = (unsigned long)zhdr; + if (bud == FIRST) + /* skip over zbud header */ + handle += ZHDR_SIZE_ALIGNED; + else /* bud == LAST */ + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + return handle; +} + +/* Returns the zbud page where a given handle is stored */ +static struct zbud_header *handle_to_zbud_header(unsigned long handle) +{ + return (struct zbud_header *)(handle & PAGE_MASK); +} + +/* Returns the number of free chunks in a zbud page */ +static int num_free_chunks(struct zbud_header *zhdr) +{ + /* + * Rather than branch for different situations, just use the fact that + * free buddies have a length of zero to simplify everything. -1 at the + * end for the zbud header. + */ + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; +} + +/***************** + * API Functions +*****************/ +/** + * zbud_create_pool() - create a new zbud pool + * @gfp: gfp flags when allocating the zbud pool structure + * @ops: user-defined operations for the zbud pool + * + * Return: pointer to the new zbud pool or NULL if the metadata allocation + * failed. + */ +struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) +{ + struct zbud_pool *pool; + int i; + + pool = kmalloc(sizeof(struct zbud_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * zbud_destroy_pool() - destroys an existing zbud pool + * @pool: the zbud pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +void zbud_destroy_pool(struct zbud_pool *pool) +{ + kfree(pool); +} + +/** + * zbud_alloc() - allocates a region of a given size + * @pool: zbud pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as zbud pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL is the size or + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate + * a new page. + */ +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, + unsigned long *handle) +{ + int chunks, i, freechunks; + struct zbud_header *zhdr = NULL; + enum buddy bud; + struct page *page; + + if (size <= 0 || gfp & __GFP_HIGHMEM) + return -EINVAL; + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED) + return -ENOSPC; + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied zbud page. 
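For orientation, this is roughly how a caller would use the allocation API documented above: allocate to get an opaque handle, map it only while touching the data, and unmap before doing anything else. The sketch assumes a kernel build context with <linux/zbud.h>, passes a NULL ops pointer (so zbud_reclaim_page() would refuse to run), and trims error handling; stash_compressed() is a hypothetical helper.

    #include <linux/gfp.h>
    #include <linux/string.h>
    #include <linux/zbud.h>

    /* Hypothetical helper: stash 'len' bytes of already-compressed data. */
    static int stash_compressed(struct zbud_pool *pool, const void *data,
                                int len, unsigned long *handle)
    {
        void *dst;
        int ret;

        ret = zbud_alloc(pool, len, GFP_KERNEL, handle);
        if (ret)
            return ret;                 /* -ENOSPC, -ENOMEM or -EINVAL */

        dst = zbud_map(pool, *handle);  /* opaque handle -> usable pointer */
        memcpy(dst, data, len);
        zbud_unmap(pool, *handle);      /* done touching the data */
        return 0;
    }

    /*
     * Typical lifetime (sketch):
     *     pool = zbud_create_pool(GFP_KERNEL, NULL);
     *     stash_compressed(pool, buf, buf_len, &handle);
     *     ...
     *     zbud_free(pool, handle);
     *     zbud_destroy_pool(pool);
     */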
*/ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct zbud_header, buddy); + list_del(&zhdr->buddy); + if (zhdr->first_chunks == 0) + bud = FIRST; + else + bud = LAST; + goto found; + } + } + + /* Couldn't find unbuddied zbud page, create new one */ + spin_unlock(&pool->lock); + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_zbud_page(page); + bud = FIRST; + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else + zhdr->last_chunks = chunks; + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* Add/move zbud page to beginning of LRU */ + if (!list_empty(&zhdr->lru)) + list_del(&zhdr->lru); + list_add(&zhdr->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * zbud_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by zbud_alloc() + * + * In the case that the zbud page in which the allocation resides is under + * reclaim, as indicated by the PG_reclaim flag being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see zbud_reclaim_page() below). + */ +void zbud_free(struct zbud_pool *pool, unsigned long handle) +{ + struct zbud_header *zhdr; + int freechunks; + + spin_lock(&pool->lock); + zhdr = handle_to_zbud_header(handle); + + /* If first buddy, handle will be page aligned */ + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) + zhdr->last_chunks = 0; + else + zhdr->first_chunks = 0; + + if (zhdr->under_reclaim) { + /* zbud page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* zbud page is empty, free */ + list_del(&zhdr->lru); + free_zbud_page(zhdr); + pool->pages_nr--; + } else { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +#define list_tail_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * zbud_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retires: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * zbud reclaim is different from normal system reclaim in that the reclaim is + * done from the bottom, up. This is because only the bottom layer, zbud, has + * information on how the allocations are organized within each zbud page. This + * has the potential to create interesting locking situations between zbud and + * the user, however. + * + * To avoid these, this is how zbud_reclaim_page() should be called: + + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call + * the user-defined eviction handler with the pool and handle as arguments. 
+ * + * If the handle can not be evicted, the eviction handler should return + * non-zero. zbud_reclaim_page() will add the zbud page back to the + * appropriate list and try the next zbud page on the LRU up to + * a user defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the setting of the PG_reclaim flag on the underlying page. + * + * If all buddies in the zbud page are successfully evicted, then the + * zbud page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict or an eviction handler is not registered, -EAGAIN if + * the retry limit was hit. + */ +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +{ + int i, ret, freechunks; + struct zbud_header *zhdr; + unsigned long first_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); + list_del(&zhdr->lru); + list_del(&zhdr->buddy); + /* Protect zbud page against free */ + zhdr->under_reclaim = true; + /* + * We need encode the handles before unlocking, since we can + * race with free that will set (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + zhdr->under_reclaim = false; + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* + * Both buddies are now free, free the zbud page and + * return success. + */ + free_zbud_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks == 0 || + zhdr->last_chunks == 0) { + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* add to beginning of LRU */ + list_add(&zhdr->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * zbud_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * While trivial for zbud, the mapping functions for others allocators + * implementing this allocation API could have more complex information encoded + * in the handle and could create temporary mappings to make the data + * accessible to the user. 
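Putting the reclaim contract above into code: the pool owner supplies an eviction callback through the ops structure given to zbud_create_pool(), and the callback must either write the data out and free the handle (returning 0) or decline with a non-zero value so the page is requeued. This is only a sketch; the evict member and its signature are inferred from the pool->ops->evict(pool, handle) calls above, and my_writeback() is purely hypothetical.

    #include <linux/zbud.h>

    /* Hypothetical: push the data behind 'handle' to its final destination. */
    static int my_writeback(struct zbud_pool *pool, unsigned long handle);

    static int my_evict(struct zbud_pool *pool, unsigned long handle)
    {
        int err = my_writeback(pool, handle);

        if (err)
            return err;              /* decline: zbud puts the page back on a list */

        zbud_free(pool, handle);     /* required on success, per the contract above */
        return 0;
    }

    static struct zbud_ops my_ops = {
        .evict = my_evict,
    };

    /*
     * pool = zbud_create_pool(GFP_KERNEL, &my_ops);
     * ...
     * zbud_reclaim_page(pool, 8);   // try up to 8 pages from the LRU tail
     */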
+ * + * Returns: a pointer to the mapped allocation + */ +void *zbud_map(struct zbud_pool *pool, unsigned long handle) +{ + return (void *)(handle); +} + +/** + * zbud_unmap() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +{ +} + +/** + * zbud_get_pool_size() - gets the zbud pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr. + */ +u64 zbud_get_pool_size(struct zbud_pool *pool) +{ + return pool->pages_nr; +} + +static int __init init_zbud(void) +{ + /* Make sure the zbud header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zbud(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zbud); +module_exit(exit_zbud); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zswap.c b/mm/zswap.c new file mode 100644 index 000000000000..deda2b671e12 --- /dev/null +++ b/mm/zswap.c @@ -0,0 +1,943 @@ +/* + * zswap.c - zswap driver file + * + * zswap is a backend for frontswap that takes pages that are in the process + * of being swapped out and attempts to compress and store them in a + * RAM-based memory pool. This can result in a significant I/O reduction on + * the swap device and, in the case where decompressing from RAM is faster + * than reading from the swap device, can also improve workload performance. + * + * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/frontswap.h> +#include <linux/rbtree.h> +#include <linux/swap.h> +#include <linux/crypto.h> +#include <linux/mempool.h> +#include <linux/zbud.h> + +#include <linux/mm_types.h> +#include <linux/page-flags.h> +#include <linux/swapops.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> + +/********************************* +* statistics +**********************************/ +/* Number of memory pages used by the compressed pool */ +static u64 zswap_pool_pages; +/* The number of compressed pages currently stored in zswap */ +static atomic_t zswap_stored_pages = ATOMIC_INIT(0); + +/* + * The statistics below are not protected from concurrent access for + * performance reasons so they may not be a 100% accurate. However, + * they do provide useful information on roughly how many times a + * certain event is occurring. 
+*/ + +/* Pool limit was hit (see zswap_max_pool_percent) */ +static u64 zswap_pool_limit_hit; +/* Pages written back when pool limit was reached */ +static u64 zswap_written_back_pages; +/* Store failed due to a reclaim failure after pool limit was reached */ +static u64 zswap_reject_reclaim_fail; +/* Compressed page was too big for the allocator to (optimally) store */ +static u64 zswap_reject_compress_poor; +/* Store failed because underlying allocator could not get memory */ +static u64 zswap_reject_alloc_fail; +/* Store failed because the entry metadata could not be allocated (rare) */ +static u64 zswap_reject_kmemcache_fail; +/* Duplicate store was encountered (rare) */ +static u64 zswap_duplicate_entry; + +/********************************* +* tunables +**********************************/ +/* Enable/disable zswap (disabled by default, fixed at boot for now) */ +static bool zswap_enabled __read_mostly; +module_param_named(enabled, zswap_enabled, bool, 0); + +/* Compressor to be used by zswap (fixed at boot for now) */ +#define ZSWAP_COMPRESSOR_DEFAULT "lzo" +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +module_param_named(compressor, zswap_compressor, charp, 0); + +/* The maximum percentage of memory that the compressed pool can occupy */ +static unsigned int zswap_max_pool_percent = 20; +module_param_named(max_pool_percent, + zswap_max_pool_percent, uint, 0644); + +/********************************* +* compression functions +**********************************/ +/* per-cpu compression transforms */ +static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; + +enum comp_op { + ZSWAP_COMPOP_COMPRESS, + ZSWAP_COMPOP_DECOMPRESS +}; + +static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct crypto_comp *tfm; + int ret; + + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); + switch (op) { + case ZSWAP_COMPOP_COMPRESS: + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); + break; + case ZSWAP_COMPOP_DECOMPRESS: + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); + break; + default: + ret = -EINVAL; + } + + put_cpu(); + return ret; +} + +static int __init zswap_comp_init(void) +{ + if (!crypto_has_comp(zswap_compressor, 0, 0)) { + pr_info("%s compressor not available\n", zswap_compressor); + /* fall back to default compressor */ + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + if (!crypto_has_comp(zswap_compressor, 0, 0)) + /* can't even load the default compressor */ + return -ENODEV; + } + pr_info("using %s compressor\n", zswap_compressor); + + /* alloc percpu transforms */ + zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); + if (!zswap_comp_pcpu_tfms) + return -ENOMEM; + return 0; +} + +static void zswap_comp_exit(void) +{ + /* free percpu transforms */ + if (zswap_comp_pcpu_tfms) + free_percpu(zswap_comp_pcpu_tfms); +} + +/********************************* +* data structures +**********************************/ +/* + * struct zswap_entry + * + * This structure contains the metadata for tracking a single compressed + * page within zswap. + * + * rbnode - links the entry into red-black tree for the appropriate swap type + * refcount - the number of outstanding reference to the entry. This is needed + * to protect against premature freeing of the entry by code + * concurent calls to load, invalidate, and writeback. The lock + * for the zswap_tree structure that contains the entry must + * be held while changing the refcount. 
Since the lock must + * be held, there is no reason to also make refcount atomic. + * offset - the swap offset for the entry. Index into the red-black tree. + * handle - zsmalloc allocation handle that stores the compressed page data + * length - the length in bytes of the compressed page data. Needed during + * decompression + */ +struct zswap_entry { + struct rb_node rbnode; + pgoff_t offset; + int refcount; + unsigned int length; + unsigned long handle; +}; + +struct zswap_header { + swp_entry_t swpentry; +}; + +/* + * The tree lock in the zswap_tree struct protects a few things: + * - the rbtree + * - the refcount field of each entry in the tree + */ +struct zswap_tree { + struct rb_root rbroot; + spinlock_t lock; + struct zbud_pool *pool; +}; + +static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static int zswap_entry_cache_create(void) +{ + zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); + return (zswap_entry_cache == NULL); +} + +static void zswap_entry_cache_destory(void) +{ + kmem_cache_destroy(zswap_entry_cache); +} + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc(zswap_entry_cache, gfp); + if (!entry) + return NULL; + entry->refcount = 1; + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock */ +static int zswap_entry_put(struct zswap_entry *entry) +{ + entry->refcount--; + return entry->refcount; +} + +/********************************* +* rbtree functions +**********************************/ +static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) +{ + struct rb_node *node = root->rb_node; + struct zswap_entry *entry; + + while (node) { + entry = rb_entry(node, struct zswap_entry, rbnode); + if (entry->offset > offset) + node = node->rb_left; + else if (entry->offset < offset) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * In the case that a entry with the same offset is found, a pointer to + * the existing entry is stored in dupentry and the function returns -EEXIST + */ +static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, + struct zswap_entry **dupentry) +{ + struct rb_node **link = &root->rb_node, *parent = NULL; + struct zswap_entry *myentry; + + while (*link) { + parent = *link; + myentry = rb_entry(parent, struct zswap_entry, rbnode); + if (myentry->offset > entry->offset) + link = &(*link)->rb_left; + else if (myentry->offset < entry->offset) + link = &(*link)->rb_right; + else { + *dupentry = myentry; + return -EEXIST; + } + } + rb_link_node(&entry->rbnode, parent, link); + rb_insert_color(&entry->rbnode, root); + return 0; +} + +/********************************* +* per-cpu code +**********************************/ +static DEFINE_PER_CPU(u8 *, zswap_dstmem); + +static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) +{ + struct crypto_comp *tfm; + u8 *dst; + + switch (action) { + case CPU_UP_PREPARE: + tfm = crypto_alloc_comp(zswap_compressor, 0, 0); + if (IS_ERR(tfm)) { + pr_err("can't allocate compressor transform\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; + dst = 
kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + crypto_free_comp(tfm); + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; + return NOTIFY_BAD; + } + per_cpu(zswap_dstmem, cpu) = dst; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); + if (tfm) { + crypto_free_comp(tfm); + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; + } + dst = per_cpu(zswap_dstmem, cpu); + kfree(dst); + per_cpu(zswap_dstmem, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zswap_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + return __zswap_cpu_notifier(action, cpu); +} + +static struct notifier_block zswap_cpu_notifier_block = { + .notifier_call = zswap_cpu_notifier +}; + +static int zswap_cpu_init(void) +{ + unsigned long cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) + goto cleanup; + register_cpu_notifier(&zswap_cpu_notifier_block); + put_online_cpus(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); + put_online_cpus(); + return -ENOMEM; +} + +/********************************* +* helpers +**********************************/ +static bool zswap_is_full(void) +{ + return (totalram_pages * zswap_max_pool_percent / 100 < + zswap_pool_pages); +} + +/* + * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * freeing the entry itself, and decrementing the number of stored pages. + */ +static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) +{ + zbud_free(tree->pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_pool_pages = zbud_get_pool_size(tree->pool); +} + +/********************************* +* writeback code +**********************************/ +/* return enum for zswap_get_swap_cache_page */ +enum zswap_get_swap_ret { + ZSWAP_SWAPCACHE_NEW, + ZSWAP_SWAPCACHE_EXIST, + ZSWAP_SWAPCACHE_NOMEM +}; + +/* + * zswap_get_swap_cache_page + * + * This is an adaption of read_swap_cache_async() + * + * This function tries to find a page with the given swap entry + * in the swapper_space address space (the swap cache). If the page + * is found, it is returned in retpage. Otherwise, a page is allocated, + * added to the swap cache, and returned in retpage. + * + * If success, the swap cache page is returned in retpage + * Returns 0 if page was already in the swap cache, page is not locked + * Returns 1 if the new page needs to be populated, page is locked + * Returns <0 on error + */ +static int zswap_get_swap_cache_page(swp_entry_t entry, + struct page **retpage) +{ + struct page *found_page, *new_page = NULL; + struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; + int err; + + *retpage = NULL; + do { + /* + * First check the swap cache. Since this is normally + * called after lookup_swap_cache() failed, re-calling + * that would confuse statistics. + */ + found_page = find_get_page(swapper_space, entry.val); + if (found_page) + break; + + /* + * Get a new page to read into from swap. + */ + if (!new_page) { + new_page = alloc_page(GFP_KERNEL); + if (!new_page) + break; /* Out of memory */ + } + + /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(GFP_KERNEL); + if (err) + break; + + /* + * Swap entry may have been freed since our caller observed it. 
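+		 * swapcache_prepare() pins the entry with SWAP_HAS_CACHE; the
+		 * -EEXIST case below means another task is already adding this
+		 * page to the swap cache, so the lookup is simply retried.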
+ */ + err = swapcache_prepare(entry); + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); + continue; + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); + break; + } + + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + err = __add_to_swap_cache(new_page, entry); + if (likely(!err)) { + radix_tree_preload_end(); + lru_cache_add_anon(new_page); + *retpage = new_page; + return ZSWAP_SWAPCACHE_NEW; + } + radix_tree_preload_end(); + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ + swapcache_free(entry, NULL); + } while (err != -ENOMEM); + + if (new_page) + page_cache_release(new_page); + if (!found_page) + return ZSWAP_SWAPCACHE_NOMEM; + *retpage = found_page; + return ZSWAP_SWAPCACHE_EXIST; +} + +/* + * Attempts to free an entry by adding a page to the swap cache, + * decompressing the entry data into the page, and issuing a + * bio write to write the page back to the swap device. + * + * This can be thought of as a "resumed writeback" of the page + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the frontswap_store() + * in the first place. After the page has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) +{ + struct zswap_header *zhdr; + swp_entry_t swpentry; + struct zswap_tree *tree; + pgoff_t offset; + struct zswap_entry *entry; + struct page *page; + u8 *src, *dst; + unsigned int dlen; + int ret, refcount; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + /* extract swpentry from data */ + zhdr = zbud_map(pool, handle); + swpentry = zhdr->swpentry; /* here */ + zbud_unmap(pool, handle); + tree = zswap_trees[swp_type(swpentry)]; + offset = swp_offset(swpentry); + BUG_ON(pool != tree->pool); + + /* find and ref zswap entry */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was invalidated */ + spin_unlock(&tree->lock); + return 0; + } + zswap_entry_get(entry); + spin_unlock(&tree->lock); + BUG_ON(offset != entry->offset); + + /* try to allocate swap cache page */ + switch (zswap_get_swap_cache_page(swpentry, &page)) { + case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ + ret = -ENOMEM; + goto fail; + + case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ + /* page is already in the swap cache, ignore for now */ + page_cache_release(page); + ret = -EEXIST; + goto fail; + + case ZSWAP_SWAPCACHE_NEW: /* page is locked */ + /* decompress */ + dlen = PAGE_SIZE; + src = (u8 *)zbud_map(tree->pool, entry->handle) + + sizeof(struct zswap_header); + dst = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, + entry->length, dst, &dlen); + kunmap_atomic(dst); + zbud_unmap(tree->pool, entry->handle); + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); + + /* page is up to date */ + SetPageUptodate(page); + } + + /* start writeback */ + __swap_writepage(page, &wbc, end_swap_bio_write); + page_cache_release(page); + zswap_written_back_pages++; + + spin_lock(&tree->lock); + + /* drop local reference */ + zswap_entry_put(entry); + /* drop the initial reference from entry creation */ + refcount = zswap_entry_put(entry); + + /* + * There are three possible values for refcount here: + * (1) 
refcount is 1, load is in progress, unlink from rbtree, + * load will free + * (2) refcount is 0, (normal case) entry is valid, + * remove from rbtree and free entry + * (3) refcount is -1, invalidate happened during writeback, + * free entry + */ + if (refcount >= 0) { + /* no invalidate yet, remove from rbtree */ + rb_erase(&entry->rbnode, &tree->rbroot); + } + spin_unlock(&tree->lock); + if (refcount <= 0) { + /* free the entry */ + zswap_free_entry(tree, entry); + return 0; + } + return -EAGAIN; + +fail: + spin_lock(&tree->lock); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + return ret; +} + +/********************************* +* frontswap hooks +**********************************/ +/* attempts to compress and store an single page */ +static int zswap_frontswap_store(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry, *dupentry; + int ret; + unsigned int dlen = PAGE_SIZE, len; + unsigned long handle; + char *buf; + u8 *src, *dst; + struct zswap_header *zhdr; + + if (!tree) { + ret = -ENODEV; + goto reject; + } + + /* reclaim space if needed */ + if (zswap_is_full()) { + zswap_pool_limit_hit++; + if (zbud_reclaim_page(tree->pool, 8)) { + zswap_reject_reclaim_fail++; + ret = -ENOMEM; + goto reject; + } + } + + /* allocate entry */ + entry = zswap_entry_cache_alloc(GFP_KERNEL); + if (!entry) { + zswap_reject_kmemcache_fail++; + ret = -ENOMEM; + goto reject; + } + + /* compress */ + dst = get_cpu_var(zswap_dstmem); + src = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); + kunmap_atomic(src); + if (ret) { + ret = -EINVAL; + goto freepage; + } + + /* store */ + len = dlen + sizeof(struct zswap_header); + ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, + &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto freepage; + } + if (ret) { + zswap_reject_alloc_fail++; + goto freepage; + } + zhdr = zbud_map(tree->pool, handle); + zhdr->swpentry = swp_entry(type, offset); + buf = (u8 *)(zhdr + 1); + memcpy(buf, dst, dlen); + zbud_unmap(tree->pool, handle); + put_cpu_var(zswap_dstmem); + + /* populate entry */ + entry->offset = offset; + entry->handle = handle; + entry->length = dlen; + + /* map */ + spin_lock(&tree->lock); + do { + ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); + if (ret == -EEXIST) { + zswap_duplicate_entry++; + /* remove from rbtree */ + rb_erase(&dupentry->rbnode, &tree->rbroot); + if (!zswap_entry_put(dupentry)) { + /* free */ + zswap_free_entry(tree, dupentry); + } + } + } while (ret == -EEXIST); + spin_unlock(&tree->lock); + + /* update stats */ + atomic_inc(&zswap_stored_pages); + zswap_pool_pages = zbud_get_pool_size(tree->pool); + + return 0; + +freepage: + put_cpu_var(zswap_dstmem); + zswap_entry_cache_free(entry); +reject: + return ret; +} + +/* + * returns 0 if the page was successfully decompressed + * return -1 on entry not found or error +*/ +static int zswap_frontswap_load(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry; + u8 *src, *dst; + unsigned int dlen; + int refcount, ret; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return -1; + } + zswap_entry_get(entry); + spin_unlock(&tree->lock); + + /* decompress */ + dlen = PAGE_SIZE; + src = (u8 *)zbud_map(tree->pool, entry->handle) + + 
sizeof(struct zswap_header); + dst = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, + dst, &dlen); + kunmap_atomic(dst); + zbud_unmap(tree->pool, entry->handle); + BUG_ON(ret); + + spin_lock(&tree->lock); + refcount = zswap_entry_put(entry); + if (likely(refcount)) { + spin_unlock(&tree->lock); + return 0; + } + spin_unlock(&tree->lock); + + /* + * We don't have to unlink from the rbtree because + * zswap_writeback_entry() or zswap_frontswap_invalidate page() + * has already done this for us if we are the last reference. + */ + /* free */ + + zswap_free_entry(tree, entry); + + return 0; +} + +/* frees an entry in zswap */ +static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry; + int refcount; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return; + } + + /* remove from rbtree */ + rb_erase(&entry->rbnode, &tree->rbroot); + + /* drop the initial reference from entry creation */ + refcount = zswap_entry_put(entry); + + spin_unlock(&tree->lock); + + if (refcount) { + /* writeback in progress, writeback will free */ + return; + } + + /* free */ + zswap_free_entry(tree, entry); +} + +/* frees all zswap entries for the given swap type */ +static void zswap_frontswap_invalidate_area(unsigned type) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct rb_node *node; + struct zswap_entry *entry; + + if (!tree) + return; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + /* + * TODO: Even though this code should not be executed because + * the try_to_unuse() in swapoff should have emptied the tree, + * it is very wasteful to rebalance the tree after every + * removal when we are freeing the whole tree. + * + * If post-order traversal code is ever added to the rbtree + * implementation, it should be used here. 
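+	 * Entries are freed below without any refcount handling: by the
+	 * time the area is invalidated the swap device is being torn down
+	 * and, as noted above, try_to_unuse() should already have emptied
+	 * the tree of active users.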
+ */ + while ((node = rb_first(&tree->rbroot))) { + entry = rb_entry(node, struct zswap_entry, rbnode); + rb_erase(&entry->rbnode, &tree->rbroot); + zbud_free(tree->pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + } + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); +} + +static struct zbud_ops zswap_zbud_ops = { + .evict = zswap_writeback_entry +}; + +static void zswap_frontswap_init(unsigned type) +{ + struct zswap_tree *tree; + + tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + if (!tree) + goto err; + tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + if (!tree->pool) + goto freetree; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + zswap_trees[type] = tree; + return; + +freetree: + kfree(tree); +err: + pr_err("alloc failed, zswap disabled for swap type %d\n", type); +} + +static struct frontswap_ops zswap_frontswap_ops = { + .store = zswap_frontswap_store, + .load = zswap_frontswap_load, + .invalidate_page = zswap_frontswap_invalidate_page, + .invalidate_area = zswap_frontswap_invalidate_area, + .init = zswap_frontswap_init +}; + +/********************************* +* debugfs functions +**********************************/ +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> + +static struct dentry *zswap_debugfs_root; + +static int __init zswap_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zswap_debugfs_root = debugfs_create_dir("zswap", NULL); + if (!zswap_debugfs_root) + return -ENOMEM; + + debugfs_create_u64("pool_limit_hit", S_IRUGO, + zswap_debugfs_root, &zswap_pool_limit_hit); + debugfs_create_u64("reject_reclaim_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_reclaim_fail); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_alloc_fail); + debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_poor", S_IRUGO, + zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("written_back_pages", S_IRUGO, + zswap_debugfs_root, &zswap_written_back_pages); + debugfs_create_u64("duplicate_entry", S_IRUGO, + zswap_debugfs_root, &zswap_duplicate_entry); + debugfs_create_u64("pool_pages", S_IRUGO, + zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_atomic_t("stored_pages", S_IRUGO, + zswap_debugfs_root, &zswap_stored_pages); + + return 0; +} + +static void __exit zswap_debugfs_exit(void) +{ + debugfs_remove_recursive(zswap_debugfs_root); +} +#else +static int __init zswap_debugfs_init(void) +{ + return 0; +} + +static void __exit zswap_debugfs_exit(void) { } +#endif + +/********************************* +* module init and exit +**********************************/ +static int __init init_zswap(void) +{ + if (!zswap_enabled) + return 0; + + pr_info("loading zswap\n"); + if (zswap_entry_cache_create()) { + pr_err("entry cache creation failed\n"); + goto error; + } + if (zswap_comp_init()) { + pr_err("compressor initialization failed\n"); + goto compfail; + } + if (zswap_cpu_init()) { + pr_err("per-cpu initialization failed\n"); + goto pcpufail; + } + frontswap_register_ops(&zswap_frontswap_ops); + if (zswap_debugfs_init()) + pr_warn("debugfs initialization failed\n"); + return 0; +pcpufail: + zswap_comp_exit(); +compfail: + zswap_entry_cache_destory(); +error: + return -ENOMEM; +} +/* must be late so crypto has time to come up */ +late_initcall(init_zswap); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); 
+MODULE_DESCRIPTION("Compressed cache for swap pages");
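The subtlest piece of the code above is the entry refcount protocol shared by zswap_frontswap_load(), zswap_frontswap_invalidate_page(), and zswap_writeback_entry(). The following is a minimal userspace sketch of that protocol, not kernel code: struct entry, entry_get(), entry_put(), invalidate(), and writeback_done() are hypothetical stand-ins that mirror the diff, a pthread mutex replaces the zswap_tree spinlock, a bool replaces rbtree membership, and main() serially replays one possible interleaving rather than running real concurrent tasks.

/*
 * Minimal userspace model of zswap's entry refcount protocol.
 * Illustration only -- not kernel code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int refcount;	/* starts at 1: the reference held by the tree */
	bool on_tree;	/* models membership in the rbtree */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void entry_get(struct entry *e) { e->refcount++; }
static int entry_put(struct entry *e) { return --e->refcount; }

/* models zswap_frontswap_invalidate_page(): unlink and drop the tree ref */
static void invalidate(struct entry *e)
{
	int refcount;

	pthread_mutex_lock(&tree_lock);
	e->on_tree = false;		/* rb_erase() */
	refcount = entry_put(e);	/* drop the initial reference */
	pthread_mutex_unlock(&tree_lock);

	if (refcount)
		printf("invalidate: writeback in progress, it will free\n");
	else
		printf("invalidate: frees the entry\n");
}

/* models the tail of zswap_writeback_entry() after the page is written */
static void writeback_done(struct entry *e)
{
	int refcount;

	pthread_mutex_lock(&tree_lock);
	entry_put(e);			/* drop the local reference */
	refcount = entry_put(e);	/* drop the initial reference */
	if (refcount >= 0)
		e->on_tree = false;	/* no invalidate yet: rb_erase() */
	pthread_mutex_unlock(&tree_lock);

	if (refcount <= 0)
		printf("writeback: frees the entry (refcount %d)\n", refcount);
	else
		printf("writeback: a concurrent load will free it\n");
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e));

	if (!e)
		return 1;
	e->refcount = 1;		/* reference taken at entry creation */
	e->on_tree = true;

	/* writeback finds the entry and takes its own reference */
	pthread_mutex_lock(&tree_lock);
	entry_get(e);
	pthread_mutex_unlock(&tree_lock);

	/* serially replay the interleaving where invalidate wins the race */
	invalidate(e);			/* prints: writeback in progress */
	writeback_done(e);		/* sees refcount -1 and frees */

	free(e);
	return 0;
}

Because every refcount change happens with the tree lock held, a plain int suffices in the kernel code, as the struct zswap_entry comment notes. The interleaving replayed here ends with a refcount of -1, which corresponds to case (3) in the comment inside zswap_writeback_entry(); the other two branches printed by the sketch correspond to cases (1) and (2).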