diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 25 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 651 | ||||
-rw-r--r-- | mm/bootmem.c | 13 | ||||
-rw-r--r-- | mm/cleancache.c | 276 | ||||
-rw-r--r-- | mm/cma.c | 62 | ||||
-rw-r--r-- | mm/cma.h | 24 | ||||
-rw-r--r-- | mm/cma_debug.c | 206 | ||||
-rw-r--r-- | mm/compaction.c | 75 | ||||
-rw-r--r-- | mm/debug.c | 2 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 187 | ||||
-rw-r--r-- | mm/frontswap.c | 215 | ||||
-rw-r--r-- | mm/gup.c | 128 | ||||
-rw-r--r-- | mm/huge_memory.c | 140 | ||||
-rw-r--r-- | mm/hugetlb.c | 443 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 15 | ||||
-rw-r--r-- | mm/internal.h | 19 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 13 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 1 | ||||
-rw-r--r-- | mm/kmemleak.c | 171 | ||||
-rw-r--r-- | mm/ksm.c | 10 | ||||
-rw-r--r-- | mm/madvise.c | 1 | ||||
-rw-r--r-- | mm/memblock.c | 179 | ||||
-rw-r--r-- | mm/memcontrol.c | 527 | ||||
-rw-r--r-- | mm/memory-failure.c | 351 | ||||
-rw-r--r-- | mm/memory.c | 469 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 52 | ||||
-rw-r--r-- | mm/mempolicy.c | 44 | ||||
-rw-r--r-- | mm/mempool.c | 127 | ||||
-rw-r--r-- | mm/memtest.c | 119 | ||||
-rw-r--r-- | mm/migrate.c | 55 | ||||
-rw-r--r-- | mm/mlock.c | 131 | ||||
-rw-r--r-- | mm/mm_init.c | 9 | ||||
-rw-r--r-- | mm/mmap.c | 31 | ||||
-rw-r--r-- | mm/mprotect.c | 11 | ||||
-rw-r--r-- | mm/mremap.c | 40 | ||||
-rw-r--r-- | mm/nobootmem.c | 21 | ||||
-rw-r--r-- | mm/nommu.c | 120 | ||||
-rw-r--r-- | mm/oom_kill.c | 165 | ||||
-rw-r--r-- | mm/page-writeback.c | 1241 | ||||
-rw-r--r-- | mm/page_alloc.c | 993 | ||||
-rw-r--r-- | mm/page_io.c | 9 | ||||
-rw-r--r-- | mm/page_isolation.c | 3 | ||||
-rw-r--r-- | mm/page_owner.c | 9 | ||||
-rw-r--r-- | mm/percpu.c | 6 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 29 | ||||
-rw-r--r-- | mm/process_vm_access.c | 35 | ||||
-rw-r--r-- | mm/readahead.c | 2 | ||||
-rw-r--r-- | mm/rmap.c | 17 | ||||
-rw-r--r-- | mm/shmem.c | 78 | ||||
-rw-r--r-- | mm/slab.c | 23 | ||||
-rw-r--r-- | mm/slab.h | 1 | ||||
-rw-r--r-- | mm/slab_common.c | 93 | ||||
-rw-r--r-- | mm/slob.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 33 | ||||
-rw-r--r-- | mm/swap.c | 35 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 4 | ||||
-rw-r--r-- | mm/truncate.c | 53 | ||||
-rw-r--r-- | mm/util.c | 41 | ||||
-rw-r--r-- | mm/vmalloc.c | 103 | ||||
-rw-r--r-- | mm/vmscan.c | 106 | ||||
-rw-r--r-- | mm/zbud.c | 23 | ||||
-rw-r--r-- | mm/zpool.c | 35 | ||||
-rw-r--r-- | mm/zsmalloc.c | 979 | ||||
-rw-r--r-- | mm/zswap.c | 12 |
67 files changed, 5926 insertions, 3174 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a03131b6ba8e..e79de2bd12cd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -368,6 +368,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select MEMORY_ISOLATION + select RAS help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running @@ -517,6 +518,12 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. +config CMA_DEBUGFS + bool "CMA debugfs interface" + depends on CMA && DEBUG_FS + help + Turns on the DebugFS interface for CMA. + config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA @@ -629,3 +636,21 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. + +# For architectures that support deferred memory initialisation +config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + bool + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kswapd" + default n + depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + depends on MEMORY_HOTPLUG + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel + when kswapd starts. This has a potential performance impact on + processes running early in the lifetime of the system until kswapd + finishes the initialisation. 
diff --git a/mm/Makefile b/mm/Makefile index 15dbe9903c27..98c4eaeabdcb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o @@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6dc4580df2af..dac5bf59309d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; @@ -48,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; + unsigned long wb_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; @@ -66,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + wb_thresh = wb_calc_thresh(wb, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -84,19 +85,19 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", - (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), + (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), + (unsigned long) 
K(wb_stat(wb, WB_RECLAIMABLE)), + K(wb_thresh), K(dirty_thresh), K(background_thresh), - (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), - (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), - (unsigned long) K(bdi->write_bandwidth), + (unsigned long) K(wb_stat(wb, WB_DIRTIED)), + (unsigned long) K(wb_stat(wb, WB_WRITTEN)), + (unsigned long) K(wb->write_bandwidth), nr_dirty, nr_io, nr_more_io, nr_dirty_time, - !list_empty(&bdi->bdi_list), bdi->state); + !list_empty(&bdi->bdi_list), bdi->wb.state); #undef K return 0; @@ -255,13 +256,8 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -int bdi_has_dirty_io(struct backing_dev_info *bdi) -{ - return wb_has_dirty_io(&bdi->wb); -} - /* - * This function is used when the first inode for this bdi is marked dirty. It + * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just @@ -274,178 +270,565 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). 
*/ -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) - queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_bh(&wb->work_lock); } /* - * Remove bdi from bdi_list, and ensure that it is no longer visible + * Initial write bandwidth: 100 MB/s */ -static void bdi_remove_from_list(struct backing_dev_info *bdi) +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + +/* + * Initialise @wb as a writeback context of @bdi. Returns 0 on success or a + * negative errno; on failure everything acquired so far is released again. + */ +static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, + int blkcg_id, gfp_t gfp) { - spin_lock_bh(&bdi_lock); - list_del_rcu(&bdi->bdi_list); - spin_unlock_bh(&bdi_lock); + int i, err; - synchronize_rcu_expedited(); -} + memset(wb, 0, sizeof(*wb)); -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...) 
-{ - va_list args; - struct device *dev; + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); + spin_lock_init(&wb->list_lock); - if (bdi->dev) /* The driver needs to use separate queues per device */ - return 0; + wb->bw_time_stamp = jiffies; + wb->balanced_dirty_ratelimit = INIT_BW; + wb->dirty_ratelimit = INIT_BW; + wb->write_bandwidth = INIT_BW; + wb->avg_write_bandwidth = INIT_BW; - va_start(args, fmt); - dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); - va_end(args); - if (IS_ERR(dev)) - return PTR_ERR(dev); + spin_lock_init(&wb->work_lock); + INIT_LIST_HEAD(&wb->work_list); + INIT_DELAYED_WORK(&wb->dwork, wb_workfn); - bdi->dev = dev; + wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); + if (!wb->congested) + return -ENOMEM; - bdi_debug_register(bdi, dev_name(dev)); - set_bit(BDI_registered, &bdi->state); + err = fprop_local_init_percpu(&wb->completions, gfp); + if (err) + goto out_put_cong; - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); + for (i = 0; i < NR_WB_STAT_ITEMS; i++) { + err = percpu_counter_init(&wb->stat[i], 0, gfp); + if (err) + goto out_destroy_stat; + } - trace_writeback_bdi_register(bdi); return 0; -} -EXPORT_SYMBOL(bdi_register); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) -{ - return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +out_destroy_stat: + /* stat[i] failed to initialise: undo only stat[i-1] .. stat[0] */ + while (i--) + percpu_counter_destroy(&wb->stat[i]); + fprop_local_destroy_percpu(&wb->completions); +out_put_cong: + wb_congested_put(wb->congested); + return err; } -EXPORT_SYMBOL(bdi_register_dev); /* * Remove bdi from the global list and shutdown any threads we have running */ -static void bdi_wb_shutdown(struct backing_dev_info *bdi) +static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ - 
spin_lock_bh(&bdi->wb_lock); - if (!test_and_clear_bit(BDI_registered, &bdi->state)) { - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (!test_and_clear_bit(WB_registered, &wb->state)) { + spin_unlock_bh(&wb->work_lock); return; } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); /* - * Make sure nobody finds us on the bdi_list anymore + * Drain work list and shutdown the delayed_work. !WB_registered + * tells wb_workfn() that @wb is dying and its work_list needs to + * be drained no matter what. */ - bdi_remove_from_list(bdi); + mod_delayed_work(bdi_wq, &wb->dwork, 0); + flush_delayed_work(&wb->dwork); + WARN_ON(!list_empty(&wb->work_list)); +} - /* - * Drain work list and shutdown the delayed_work. At this point, - * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi - * is dying and its work_list needs to be drained no matter what. - */ - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - flush_delayed_work(&bdi->wb.dwork); +static void wb_exit(struct bdi_writeback *wb) +{ + int i; + + WARN_ON(delayed_work_pending(&wb->dwork)); + + for (i = 0; i < NR_WB_STAT_ITEMS; i++) + percpu_counter_destroy(&wb->stat[i]); + + fprop_local_destroy_percpu(&wb->completions); + wb_congested_put(wb->congested); } +#ifdef CONFIG_CGROUP_WRITEBACK + +#include <linux/memcontrol.h> + /* - * Called when the device behind @bdi has been removed or ejected. + * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, + * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU + * protected. cgwb_release_wait is used to wait for the completion of cgwb + * releases from bdi destruction path. + */ +static DEFINE_SPINLOCK(cgwb_lock); +static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); + +/** + * wb_congested_get_create - get or create a wb_congested + * @bdi: associated bdi + * @blkcg_id: ID of the associated blkcg + * @gfp: allocation mask + * + * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. 
+ * The returned wb_congested has its reference count incremented. Returns + * NULL on failure. + */ +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + struct bdi_writeback_congested *new_congested = NULL, *congested; + struct rb_node **node, *parent; + unsigned long flags; +retry: + spin_lock_irqsave(&cgwb_lock, flags); + + node = &bdi->cgwb_congested_tree.rb_node; + parent = NULL; + + while (*node != NULL) { + parent = *node; + congested = container_of(parent, struct bdi_writeback_congested, + rb_node); + if (congested->blkcg_id < blkcg_id) + node = &parent->rb_left; + else if (congested->blkcg_id > blkcg_id) + node = &parent->rb_right; + else + goto found; + } + + if (new_congested) { + /* !found and storage for new one already allocated, insert */ + congested = new_congested; + new_congested = NULL; + rb_link_node(&congested->rb_node, parent, node); + rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); + goto found; + } + + spin_unlock_irqrestore(&cgwb_lock, flags); + + /* allocate storage for new one and retry */ + new_congested = kzalloc(sizeof(*new_congested), gfp); + if (!new_congested) + return NULL; + + atomic_set(&new_congested->refcnt, 0); + new_congested->bdi = bdi; + new_congested->blkcg_id = blkcg_id; + goto retry; + +found: + atomic_inc(&congested->refcnt); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(new_congested); + return congested; +} + +/** + * wb_congested_put - put a wb_congested + * @congested: wb_congested to put * - * We can't really do much here except for reducing the dirty ratio at - * the moment. In the future we should be able to set a flag so that - * the filesystem can handle errors at mark_inode_dirty time instead - * of only at writeback time. + * Put @congested and destroy it if the refcnt reaches zero. 
*/ -void bdi_unregister(struct backing_dev_info *bdi) +void wb_congested_put(struct bdi_writeback_congested *congested) { - if (WARN_ON_ONCE(!bdi->dev)) + unsigned long flags; + + local_irq_save(flags); + if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { + local_irq_restore(flags); return; + } - bdi_set_min_ratio(bdi, 0); + /* bdi might already have been destroyed leaving @congested unlinked */ + if (congested->bdi) { + rb_erase(&congested->rb_node, + &congested->bdi->cgwb_congested_tree); + congested->bdi = NULL; + } + + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(congested); } -EXPORT_SYMBOL(bdi_unregister); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +static void cgwb_release_workfn(struct work_struct *work) { - memset(wb, 0, sizeof(*wb)); + struct bdi_writeback *wb = container_of(work, struct bdi_writeback, + release_work); + struct backing_dev_info *bdi = wb->bdi; - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - INIT_LIST_HEAD(&wb->b_more_io); - INIT_LIST_HEAD(&wb->b_dirty_time); - spin_lock_init(&wb->list_lock); - INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); + wb_shutdown(wb); + + css_put(wb->memcg_css); + css_put(wb->blkcg_css); + + fprop_local_destroy_percpu(&wb->memcg_completions); + percpu_ref_exit(&wb->refcnt); + wb_exit(wb); + kfree_rcu(wb, rcu); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); } -/* - * Initial write bandwidth: 100 MB/s +static void cgwb_release(struct percpu_ref *refcnt) +{ + struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, + refcnt); + schedule_work(&wb->release_work); +} + +static void cgwb_kill(struct bdi_writeback *wb) +{ + lockdep_assert_held(&cgwb_lock); + + WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); + list_del(&wb->memcg_node); + list_del(&wb->blkcg_node); + percpu_ref_kill(&wb->refcnt); +} + +static int cgwb_create(struct 
backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, gfp_t gfp) +{ + struct mem_cgroup *memcg; + struct cgroup_subsys_state *blkcg_css; + struct blkcg *blkcg; + struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; + struct bdi_writeback *wb; + unsigned long flags; + int ret = 0; + + memcg = mem_cgroup_from_css(memcg_css); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg = css_to_blkcg(blkcg_css); + memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + blkcg_cgwb_list = &blkcg->cgwb_list; + + /* look up again under lock and discard on blkcg mismatch */ + spin_lock_irqsave(&cgwb_lock, flags); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb && wb->blkcg_css != blkcg_css) { + cgwb_kill(wb); + wb = NULL; + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (wb) + goto out_put; + + /* need to create a new one */ + wb = kmalloc(sizeof(*wb), gfp); + if (!wb) + return -ENOMEM; + + ret = wb_init(wb, bdi, blkcg_css->id, gfp); + if (ret) + goto err_free; + + ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); + if (ret) + goto err_wb_exit; + + ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); + if (ret) + goto err_ref_exit; + + wb->memcg_css = memcg_css; + wb->blkcg_css = blkcg_css; + INIT_WORK(&wb->release_work, cgwb_release_workfn); + set_bit(WB_registered, &wb->state); + + /* + * The root wb determines the registered state of the whole bdi and + * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate + * whether they're still online. Don't link @wb if any is dead. + * See wb_memcg_offline() and wb_blkcg_offline(). 
+ */ + ret = -ENODEV; + spin_lock_irqsave(&cgwb_lock, flags); + if (test_bit(WB_registered, &bdi->wb.state) && + blkcg_cgwb_list->next && memcg_cgwb_list->next) { + /* we might have raced another instance of this function */ + ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); + if (!ret) { + atomic_inc(&bdi->usage_cnt); + list_add(&wb->memcg_node, memcg_cgwb_list); + list_add(&wb->blkcg_node, blkcg_cgwb_list); + css_get(memcg_css); + css_get(blkcg_css); + } + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (ret) { + if (ret == -EEXIST) + ret = 0; + goto err_fprop_exit; + } + goto out_put; + +err_fprop_exit: + fprop_local_destroy_percpu(&wb->memcg_completions); +err_ref_exit: + percpu_ref_exit(&wb->refcnt); +err_wb_exit: + wb_exit(wb); +err_free: + kfree(wb); +out_put: + css_put(blkcg_css); + return ret; +} + +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. The returned wb has its refcount incremented. + * + * This function uses css_get() on @memcg_css and thus expects its refcnt + * to be positive on invocation. IOW, rcu_read_lock() protection on + * @memcg_css isn't enough. try_get it before calling this function. + * + * A wb is keyed by its associated memcg. As blkcg implicitly enables + * memcg on the default hierarchy, memcg association is guaranteed to be + * more specific (equal or descendant to the associated blkcg) and thus can + * identify both the memcg and blkcg associations. + * + * Because the blkcg associated with a memcg may change as blkcg is enabled + * and disabled closer to root in the hierarchy, each wb keeps track of + * both the memcg and blkcg associated with it and verifies the blkcg on + * each lookup. On mismatch, the existing wb is discarded and a new one is + * created. 
*/ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp) +{ + struct bdi_writeback *wb; + + might_sleep_if(gfp & __GFP_WAIT); + + if (!memcg_css->parent) + return &bdi->wb; + + do { + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, + &blkio_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || + !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); + + return wb; +} -int bdi_init(struct backing_dev_info *bdi) +static int cgwb_bdi_init(struct backing_dev_info *bdi) { - int i, err; + int ret; + + INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); + bdi->cgwb_congested_tree = RB_ROOT; + atomic_set(&bdi->usage_cnt, 1); + + ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + if (!ret) { + bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.blkcg_css = blkcg_root_css; + } + return ret; +} + +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +{ + struct radix_tree_iter iter; + struct bdi_writeback_congested *congested, *congested_n; + void **slot; + + WARN_ON(test_bit(WB_registered, &bdi->wb.state)); + + spin_lock_irq(&cgwb_lock); + + radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) + cgwb_kill(*slot); + rbtree_postorder_for_each_entry_safe(congested, congested_n, + &bdi->cgwb_congested_tree, rb_node) { + rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree); + congested->bdi = NULL; /* mark @congested unlinked */ + } + + spin_unlock_irq(&cgwb_lock); + + /* + * All cgwb's and their congested states must be shutdown and + * released before returning. Drain the usage counter to wait for + * all cgwb's and cgwb_congested's ever created on @bdi. 
+ */ + atomic_dec(&bdi->usage_cnt); + wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); +} + +/** + * wb_memcg_offline - kill all wb's associated with a memcg being offlined + * @memcg: memcg being offlined + * + * Also prevents creation of any new wb's associated with @memcg. + */ +void wb_memcg_offline(struct mem_cgroup *memcg) +{ + LIST_HEAD(to_destroy); + struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) + cgwb_kill(wb); + memcg_cgwb_list->next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +/** + * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined + * @blkcg: blkcg being offlined + * + * Also prevents creation of any new wb's associated with @blkcg. + */ +void wb_blkcg_offline(struct blkcg *blkcg) +{ + LIST_HEAD(to_destroy); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) + cgwb_kill(wb); + blkcg->cgwb_list.next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int cgwb_bdi_init(struct backing_dev_info *bdi) +{ + int err; + + bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL); + if (!bdi->wb_congested) + return -ENOMEM; + + err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + if (err) { + kfree(bdi->wb_congested); + return err; + } + return 0; +} + +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +int bdi_init(struct backing_dev_info *bdi) +{ bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; - spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->work_list); + init_waitqueue_head(&bdi->wb_waitq); - bdi_wb_init(&bdi->wb, bdi); + return cgwb_bdi_init(bdi); +} 
+EXPORT_SYMBOL(bdi_init); - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); - if (err) - goto err; - } +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) +{ + va_list args; + struct device *dev; - bdi->dirty_exceeded = 0; + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; - bdi->bw_time_stamp = jiffies; - bdi->written_stamp = 0; + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) + return PTR_ERR(dev); - bdi->balanced_dirty_ratelimit = INIT_BW; - bdi->dirty_ratelimit = INIT_BW; - bdi->write_bandwidth = INIT_BW; - bdi->avg_write_bandwidth = INIT_BW; + bdi->dev = dev; - err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); + bdi_debug_register(bdi, dev_name(dev)); + set_bit(WB_registered, &bdi->wb.state); - if (err) { -err: - while (i--) - percpu_counter_destroy(&bdi->bdi_stat[i]); - } + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); - return err; + trace_writeback_bdi_register(bdi); + return 0; } -EXPORT_SYMBOL(bdi_init); +EXPORT_SYMBOL(bdi_register); -void bdi_destroy(struct backing_dev_info *bdi) +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) { - int i; + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); - bdi_wb_shutdown(bdi); +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); + synchronize_rcu_expedited(); +} + +void bdi_destroy(struct backing_dev_info *bdi) +{ + /* make sure nobody finds us on the bdi_list anymore */ + bdi_remove_from_list(bdi); + 
wb_shutdown(&bdi->wb); + cgwb_bdi_destroy(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); @@ -453,9 +836,7 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi->dev = NULL; } - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) - percpu_counter_destroy(&bdi->bdi_stat[i]); - fprop_local_destroy_percpu(&bdi->completions); + wb_exit(&bdi->wb); } EXPORT_SYMBOL(bdi_destroy); @@ -488,31 +869,31 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; -static atomic_t nr_bdi_congested[2]; +static atomic_t nr_wb_congested[2]; -void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum bdi_state bit; wait_queue_head_t *wqh = &congestion_wqh[sync]; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (test_and_clear_bit(bit, &bdi->state)) - atomic_dec(&nr_bdi_congested[sync]); + bit = sync ? WB_sync_congested : WB_async_congested; + if (test_and_clear_bit(bit, &congested->state)) + atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } -EXPORT_SYMBOL(clear_bdi_congested); +EXPORT_SYMBOL(clear_wb_congested); -void set_bdi_congested(struct backing_dev_info *bdi, int sync) +void set_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum bdi_state bit; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (!test_and_set_bit(bit, &bdi->state)) - atomic_inc(&nr_bdi_congested[sync]); + bit = sync ? 
WB_sync_congested : WB_async_congested; + if (!test_and_set_bit(bit, &congested->state)) + atomic_inc(&nr_wb_congested[sync]); } -EXPORT_SYMBOL(set_bdi_congested); +EXPORT_SYMBOL(set_wb_congested); /** * congestion_wait - wait for a backing_dev to become uncongested @@ -571,7 +952,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * encountered in the current zone, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_bdi_congested[sync]) == 0 || + if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { cond_resched(); diff --git a/mm/bootmem.c b/mm/bootmem.c index 477be696511d..a23dd1934654 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long *map, start, end, pages, count = 0; + unsigned long *map, start, end, pages, cur, count = 0; if (!bdata->node_bootmem_map) return 0; @@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long cur = start; + cur = start; start = ALIGN(start + 1, BITS_PER_LONG); while (vec && cur != start) { if (vec & 1) { page = pfn_to_page(cur); - __free_pages_bootmem(page, 0); + __free_pages_bootmem(page, cur, 0); count++; } vec >>= 1; @@ -229,12 +229,13 @@ static unsigned long 
__init free_all_bootmem_core(bootmem_data_t *bdata) } } + cur = bdata->node_min_pfn; page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; while (pages--) - __free_pages_bootmem(page++, 0); + __free_pages_bootmem(page++, cur++, 0); bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/cleancache.c b/mm/cleancache.c index 053bcd8f12fb..8fc50811119b 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,7 +19,7 @@ #include <linux/cleancache.h> /* - * cleancache_ops is set by cleancache_ops_register to contain the pointers + * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ static struct cleancache_ops *cleancache_ops __read_mostly; @@ -34,145 +34,107 @@ static u64 cleancache_failed_gets; static u64 cleancache_puts; static u64 cleancache_invalidates; -/* - * When no backend is registered all calls to init_fs and init_shared_fs - * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or - * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array - * [shared_|]fs_poolid_map) are given to the respective super block - * (sb->cleancache_poolid) and no tmem_pools are created. When a backend - * registers with cleancache the previous calls to init_fs and init_shared_fs - * are executed to create tmem_pools and set the respective poolids. While no - * backend is registered all "puts", "gets" and "flushes" are ignored or failed. 
- */ -#define MAX_INITIALIZABLE_FS 32 -#define FAKE_FS_POOLID_OFFSET 1000 -#define FAKE_SHARED_FS_POOLID_OFFSET 2000 - -#define FS_NO_BACKEND (-1) -#define FS_UNKNOWN (-2) -static int fs_poolid_map[MAX_INITIALIZABLE_FS]; -static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; -static char *uuids[MAX_INITIALIZABLE_FS]; -/* - * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads - * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple - * threads calling mount (and ending up in __cleancache_init_[shared|]fs). - */ -static DEFINE_MUTEX(poolid_mutex); -/* - * When set to false (default) all calls to the cleancache functions, except - * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded - * by the if (!cleancache_ops) return. This means multiple threads (from - * different filesystems) will be checking cleancache_ops. The usage of a - * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are - * OK if the time between the backend's have been initialized (and - * cleancache_ops has been set to not NULL) and when the filesystems start - * actually calling the backends. The inverse (when unloading) is obviously - * not good - but this shim does not do that (yet). - */ - -/* - * The backends and filesystems work all asynchronously. This is b/c the - * backends can be built as modules. - * The usual sequence of events is: - * a) mount / -> __cleancache_init_fs is called. We set the - * [shared_|]fs_poolid_map and uuids for. - * - * b). user does I/Os -> we call the rest of __cleancache_* functions - * which return immediately as cleancache_ops is false. - * - * c). modprobe zcache -> cleancache_register_ops. We init the backend - * and set cleancache_ops to true, and for any fs_poolid_map - * (which is set by __cleancache_init_fs) we initialize the poolid. - * - * d). user does I/Os -> now that cleancache_ops is true all the - * __cleancache_* functions can call the backend. 
They all check - * that fs_poolid_map is valid and if so invoke the backend. - * - * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is - * reset (which is the second check in the __cleancache_* ops - * to call the backend). - * - * The sequence of event could also be c), followed by a), and d). and e). The - * c) would not happen anymore. There is also the chance of c), and one thread - * doing a) + d), and another doing e). For that case we depend on the - * filesystem calling __cleancache_invalidate_fs in the proper sequence (so - * that it handles all I/Os before it invalidates the fs (which is last part - * of unmounting process). - * - * Note: The acute reader will notice that there is no "rmmod zcache" case. - * This is b/c the functionality for that is not yet implemented and when - * done, will require some extra locking not yet devised. - */ +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} /* - * Register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting. + * Register operations for cleancache. Returns 0 on success. */ -struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(struct cleancache_ops *ops) { - struct cleancache_ops *old = cleancache_ops; - int i; + if (cmpxchg(&cleancache_ops, NULL, ops)) + return -EBUSY; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_NO_BACKEND) - fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); - if (shared_fs_poolid_map[i] == FS_NO_BACKEND) - shared_fs_poolid_map[i] = ops->init_shared_fs - (uuids[i], PAGE_SIZE); - } /* - * We MUST set cleancache_ops _after_ we have called the backends - * init_fs or init_shared_fs functions. 
Otherwise the compiler might - * re-order where cleancache_ops is set in this function. + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. + * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. 
+ * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. */ - barrier(); - cleancache_ops = ops; - mutex_unlock(&poolid_mutex); - return old; + iterate_supers(cleancache_register_ops_sb, NULL); + return 0; } EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; - if (cleancache_ops) - fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); - else - fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +void __cleancache_init_shared_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if 
(shared_fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = uuid; - if (cleancache_ops) - shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (uuid, PAGE_SIZE); - else - shared_fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode, } /* - * Returns a pool_id that is associated with a given fake poolid. - */ -static int get_poolid_from_fake(int fake_pool_id) -{ - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) - return shared_fs_poolid_map[fake_pool_id - - FAKE_SHARED_FS_POOLID_OFFSET]; - else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) - return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; - return FS_NO_BACKEND; -} - -/* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if * successful, use it to fill the specified page with data and return 0. 
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) goto out; - pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - if (pool_id >= 0) - ret = cleancache_ops->get_page(pool_id, - key, page->index, page); + ret = cleancache_ops->get_page(pool_id, key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page); void __cleancache_put_page(struct page *page) { int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - + pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id >= 0 && cleancache_get_key(page->mapping->host, &key) >= 0) { cleancache_ops->put_page(pool_id, key, page->index, page); @@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... 
page->mapping is NULL sometimes when this is called */ - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id >= 0) { - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id < 0) - return; - + if (pool_id >= 0) { VM_BUG_ON_PAGE(!PageLocked(page), page); if (cleancache_get_key(mapping->host, &key) >= 0) { cleancache_ops->invalidate_page(pool_id, @@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); */ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) cleancache_ops->invalidate_inode(pool_id, key); } @@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode); */ void __cleancache_invalidate_fs(struct super_block *sb) { - int index; - int fake_pool_id = sb->cleancache_poolid; - int old_poolid = fake_pool_id; + int pool_id; - mutex_lock(&poolid_mutex); - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; - old_poolid = shared_fs_poolid_map[index]; - shared_fs_poolid_map[index] = FS_UNKNOWN; - uuids[index] = NULL; - } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_FS_POOLID_OFFSET; - old_poolid = fs_poolid_map[index]; - fs_poolid_map[index] = FS_UNKNOWN; - } - sb->cleancache_poolid = -1; - if (cleancache_ops) - cleancache_ops->invalidate_fs(old_poolid); - mutex_unlock(&poolid_mutex); + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + 
cleancache_ops->invalidate_fs(pool_id); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { - int i; - #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -400,10 +314,6 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - fs_poolid_map[i] = FS_UNKNOWN; - shared_fs_poolid_map[i] = FS_UNKNOWN; - } return 0; } module_init(init_cleancache) @@ -23,6 +23,7 @@ # define DEBUG #endif #endif +#define CREATE_TRACE_POINTS #include <linux/memblock.h> #include <linux/err.h> @@ -34,30 +35,26 @@ #include <linux/cma.h> #include <linux/highmem.h> #include <linux/io.h> +#include <trace/events/cma.h> -struct cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; - unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; -}; +#include "cma.h" -static struct cma cma_areas[MAX_CMA_AREAS]; -static unsigned cma_area_count; +struct cma cma_areas[MAX_CMA_AREAS]; +unsigned cma_area_count; static DEFINE_MUTEX(cma_mutex); -phys_addr_t cma_get_base(struct cma *cma) +phys_addr_t cma_get_base(const struct cma *cma) { return PFN_PHYS(cma->base_pfn); } -unsigned long cma_get_size(struct cma *cma) +unsigned long cma_get_size(const struct cma *cma) { return cma->count << PAGE_SHIFT; } -static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -68,7 +65,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) * Find a PFN aligned to the specified order and return an offset represented in * order_per_bits. 
*/ -static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -77,18 +75,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) - cma->base_pfn) >> cma->order_per_bit; } -static unsigned long cma_bitmap_maxno(struct cma *cma) -{ - return cma->count >> cma->order_per_bit; -} - -static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, - unsigned long pages) +static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, + unsigned long pages) { return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } -static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, + unsigned int count) { unsigned long bitmap_no, bitmap_count; @@ -134,6 +128,12 @@ static int __init cma_activate_area(struct cma *cma) } while (--i); mutex_init(&cma->lock); + +#ifdef CONFIG_CMA_DEBUGFS + INIT_HLIST_HEAD(&cma->mem_head); + spin_lock_init(&cma->mem_head_lock); +#endif + return 0; err: @@ -167,7 +167,8 @@ core_initcall(cma_init_reserved_areas); * This function creates custom contiguous area from already reserved memory. 
*/ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, - int order_per_bit, struct cma **res_cma) + unsigned int order_per_bit, + struct cma **res_cma) { struct cma *cma; phys_addr_t alignment; @@ -181,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* ensure minimal alignment requied by mm core */ + /* ensure minimal alignment required by mm core */ alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); /* alignment should be aligned with order_per_bit */ @@ -237,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base, /* * high_memory isn't direct mapped memory so retrieving its physical * address isn't appropriate. But it would be useful to check the - * physical address of the highmem boundary so it's justfiable to get + * physical address of the highmem boundary so it's justifiable to get * the physical address from it. On x86 there is a validation check for * this case, so the following workaround is needed to avoid it. */ @@ -315,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base, */ if (base < highmem_start && limit > highmem_start) { addr = memblock_alloc_range(size, alignment, - highmem_start, limit); + highmem_start, limit, + MEMBLOCK_NONE); limit = highmem_start; } if (!addr) { addr = memblock_alloc_range(size, alignment, base, - limit); + limit, + MEMBLOCK_NONE); if (!addr) { ret = -ENOMEM; goto err; @@ -358,7 +361,7 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. 
*/ -struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) { unsigned long mask, offset, pfn, start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; @@ -415,6 +418,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) start = bitmap_no + mask + 1; } + trace_cma_alloc(page ? pfn : -1UL, page, count, align); + pr_debug("%s(): returned %p\n", __func__, page); return page; } @@ -429,7 +434,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ -bool cma_release(struct cma *cma, struct page *pages, int count) +bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) { unsigned long pfn; @@ -447,6 +452,7 @@ bool cma_release(struct cma *cma, struct page *pages, int count) free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); + trace_cma_release(pfn, pages, count); return true; } diff --git a/mm/cma.h b/mm/cma.h new file mode 100644 index 000000000000..1132d733556d --- /dev/null +++ b/mm/cma.h @@ -0,0 +1,24 @@ +#ifndef __MM_CMA_H__ +#define __MM_CMA_H__ + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +#ifdef CONFIG_CMA_DEBUGFS + struct hlist_head mem_head; + spinlock_t mem_head_lock; +#endif +}; + +extern struct cma cma_areas[MAX_CMA_AREAS]; +extern unsigned cma_area_count; + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +#endif diff --git a/mm/cma_debug.c b/mm/cma_debug.c new file mode 100644 index 000000000000..f8e4b60db167 --- /dev/null +++ b/mm/cma_debug.c @@ -0,0 +1,206 @@ +/* + * CMA DebugFS Interface + * + * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com> + */ + + +#include 
<linux/debugfs.h> +#include <linux/cma.h> +#include <linux/list.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm_types.h> + +#include "cma.h" + +struct cma_mem { + struct hlist_node node; + struct page *p; + unsigned long n; +}; + +static struct dentry *cma_debugfs_root; + +static int cma_debugfs_get(void *data, u64 *val) +{ + unsigned long *p = data; + + *val = *p; + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); + +static int cma_used_get(void *data, u64 *val) +{ + struct cma *cma = data; + unsigned long used; + + mutex_lock(&cma->lock); + /* pages counter is smaller than sizeof(int) */ + used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); + mutex_unlock(&cma->lock); + *val = (u64)used << cma->order_per_bit; + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); + +static int cma_maxchunk_get(void *data, u64 *val) +{ + struct cma *cma = data; + unsigned long maxchunk = 0; + unsigned long start, end = 0; + unsigned long bitmap_maxno = cma_bitmap_maxno(cma); + + mutex_lock(&cma->lock); + for (;;) { + start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); + if (start >= cma->count) + break; + end = find_next_bit(cma->bitmap, bitmap_maxno, start); + maxchunk = max(end - start, maxchunk); + } + mutex_unlock(&cma->lock); + *val = (u64)maxchunk << cma->order_per_bit; + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); + +static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) +{ + spin_lock(&cma->mem_head_lock); + hlist_add_head(&mem->node, &cma->mem_head); + spin_unlock(&cma->mem_head_lock); +} + +static struct cma_mem *cma_get_entry_from_list(struct cma *cma) +{ + struct cma_mem *mem = NULL; + + spin_lock(&cma->mem_head_lock); + if (!hlist_empty(&cma->mem_head)) { + mem = hlist_entry(cma->mem_head.first, struct cma_mem, node); + hlist_del_init(&mem->node); + } + 
spin_unlock(&cma->mem_head_lock); + + return mem; +} + +static int cma_free_mem(struct cma *cma, int count) +{ + struct cma_mem *mem = NULL; + + while (count) { + mem = cma_get_entry_from_list(cma); + if (mem == NULL) + return 0; + + if (mem->n <= count) { + cma_release(cma, mem->p, mem->n); + count -= mem->n; + kfree(mem); + } else if (cma->order_per_bit == 0) { + cma_release(cma, mem->p, count); + mem->p += count; + mem->n -= count; + count = 0; + cma_add_to_cma_mem_list(cma, mem); + } else { + pr_debug("cma: cannot release partial block when order_per_bit != 0\n"); + cma_add_to_cma_mem_list(cma, mem); + break; + } + } + + return 0; + +} + +static int cma_free_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_free_mem(cma, pages); +} +DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); + +static int cma_alloc_mem(struct cma *cma, int count) +{ + struct cma_mem *mem; + struct page *p; + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + p = cma_alloc(cma, count, 0); + if (!p) { + kfree(mem); + return -ENOMEM; + } + + mem->p = p; + mem->n = count; + + cma_add_to_cma_mem_list(cma, mem); + + return 0; +} + +static int cma_alloc_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_alloc_mem(cma, pages); +} +DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); + +static void cma_debugfs_add_one(struct cma *cma, int idx) +{ + struct dentry *tmp; + char name[16]; + int u32s; + + sprintf(name, "cma-%d", idx); + + tmp = debugfs_create_dir(name, cma_debugfs_root); + + debugfs_create_file("alloc", S_IWUSR, tmp, cma, + &cma_alloc_fops); + + debugfs_create_file("free", S_IWUSR, tmp, cma, + &cma_free_fops); + + debugfs_create_file("base_pfn", S_IRUGO, tmp, + &cma->base_pfn, &cma_debugfs_fops); + debugfs_create_file("count", S_IRUGO, tmp, + &cma->count, &cma_debugfs_fops); + debugfs_create_file("order_per_bit", S_IRUGO, tmp, + 
&cma->order_per_bit, &cma_debugfs_fops); + debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops); + debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops); + + u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); + debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); +} + +static int __init cma_debugfs_init(void) +{ + int i; + + cma_debugfs_root = debugfs_create_dir("cma", NULL); + if (!cma_debugfs_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) + cma_debugfs_add_one(&cma_areas[i], i); + + return 0; +} +late_initcall(cma_debugfs_init); diff --git a/mm/compaction.c b/mm/compaction.c index 8c0d9459b54a..018f08da99a2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc) return false; } -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - /* If the page is a large free page, then disallow migration */ - if (PageBuddy(page)) { - /* - * We are checking page_order without zone->lock taken. But - * the only small danger is that we skip a potentially suitable - * pageblock, so it's not worth to check order for valid range. - */ - if (page_order_unsafe(page) >= pageblock_order) - return false; - } - - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) - return true; - - /* Otherwise skip the block */ - return false; -} - /* * Isolate free pages onto a private freelist. 
If @strict is true, will abort * returning 0 on any invalid PFNs or non-free pages inside of the pageblock @@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + /* If the page is a large free page, then disallow migration */ + if (PageBuddy(page)) { + /* + * We are checking page_order without zone->lock taken. But + * the only small danger is that we skip a potentially suitable + * pageblock, so it's not worth to check order for valid range. + */ + if (page_order_unsafe(page) >= pageblock_order) + return false; + } + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(get_pageblock_migratetype(page))) + return true; + + /* Otherwise skip the block */ + return false; +} + /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. @@ -1047,6 +1048,12 @@ typedef enum { } isolate_migrate_t; /* + * Allow userspace to control policy on scanning the unevictable LRU for + * compactable pages. + */ +int sysctl_compact_unevictable_allowed __read_mostly = 1; + +/* * Isolate all pages that can be migrated from the first suitable block, * starting at the block pointed to by the migrate scanner pfn within * compact_control. @@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long low_pfn, end_pfn; struct page *page; const isolate_mode_t isolate_mode = + (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); /* @@ -1174,13 +1182,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* Direct compactor: Is a suitable page free? 
*/ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; + bool can_steal; /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype])) return COMPACT_PARTIAL; - /* Job done if allocation would set block type */ - if (order >= pageblock_order && area->nr_free) +#ifdef CONFIG_CMA + /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ + if (migratetype == MIGRATE_MOVABLE && + !list_empty(&area->free_list[MIGRATE_CMA])) + return COMPACT_PARTIAL; +#endif + /* + * Job done if allocation would steal freepages from + * other migratetype buddy lists. + */ + if (find_suitable_fallback(area, order, migratetype, + true, &can_steal) != -1) return COMPACT_PARTIAL; } @@ -1587,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); + /* + * When called via /proc/sys/vm/compact_memory + * this makes sure we compact the whole zone regardless of + * cached scanner positions. 
+ */ + if (cc->order == -1) + __reset_isolation_suitable(zone); + if (cc->order == -1 || !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); diff --git a/mm/debug.c b/mm/debug.c index 3eb3ac2fcee7..76089ddf99ea 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/memcontrol.h> static const struct trace_print_flags pageflag_names[] = { diff --git a/mm/fadvise.c b/mm/fadvise.c index 4a3907cf79f8..b8a5bc66b0c0 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(mapping->host)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..1283fc825458 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,7 +13,6 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/uaccess.h> -#include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> @@ -101,6 +100,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -175,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. 
The caller must hold the mapping's tree_lock and + * mem_cgroup_begin_page_stat(). */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __delete_from_page_cache(struct page *page, void *shadow, + struct mem_cgroup *memcg) { struct address_space *mapping = page->mapping; @@ -197,22 +199,24 @@ void __delete_from_page_cache(struct page *page, void *shadow) page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - __dec_zone_page_state(page, NR_FILE_PAGES); + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(page)) + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* - * Some filesystems seem to re-dirty the page even after - * the VM has canceled the dirty bit (eg ext3 journaling). + * At this point page must be either written or cleaned by truncate. + * Dirty page here signals a bug and loss of unwritten data. * - * Fix it up by doing a final dirty accounting check after - * having removed the page entirely. + * This fixes dirty accounting after removing the page entirely but + * leaves PageDirty set: it has no effect for truncated page and + * anyway will be cleared before returning page into buddy allocator. 
*/ - if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); - } + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping, memcg, + inode_to_wb(mapping->host)); } /** @@ -226,14 +230,20 @@ void __delete_from_page_cache(struct page *page, void *shadow) void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); freepage = mapping->a_ops->freepage; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page, NULL); - spin_unlock_irq(&mapping->tree_lock); + + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(page, NULL, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -283,7 +293,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, if (!mapping_cap_writeback_dirty(mapping)) return 0; + wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); return ret; } @@ -472,6 +484,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); + struct mem_cgroup *memcg; + unsigned long flags; pgoff_t offset = old->index; freepage = mapping->a_ops->freepage; @@ -480,15 +494,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old, NULL); + memcg = mem_cgroup_begin_page_stat(old); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(old, NULL, memcg); error = 
radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; - __inc_zone_page_state(new, NR_FILE_PAGES); + + /* + * hugetlb pages do not participate in page cache accounting. + */ + if (!PageHuge(new)) + __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) @@ -577,7 +598,10 @@ static int __add_to_page_cache_locked(struct page *page, radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - __inc_zone_page_state(page, NR_FILE_PAGES); + + /* hugetlb pages do not participate in page cache accounting. */ + if (!huge) + __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_commit_charge(page, memcg, false); @@ -1656,8 +1680,8 @@ no_cached_page: error = -ENOMEM; goto out; } - error = add_to_page_cache_lru(page, mapping, - index, GFP_KERNEL); + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1695,7 +1719,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; - if (io_is_direct(file)) { + if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); @@ -1708,7 +1732,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) pos + count - 1); if (!retval) { struct iov_iter data = *iter; - retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); + retval = mapping->a_ops->direct_IO(iocb, &data, pos); } if (retval > 0) { @@ -1758,7 +1782,8 @@ static int page_cache_read(struct file *file, pgoff_t offset) if (!page) return -ENOMEM; - ret = 
add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + ret = add_to_page_cache_lru(page, mapping, offset, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2261,41 +2286,38 @@ EXPORT_SYMBOL(read_cache_page_gfp); * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ -inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; unsigned long limit = rlimit(RLIMIT_FSIZE); + loff_t pos; - if (unlikely(*pos < 0)) - return -EINVAL; + if (!iov_iter_count(from)) + return 0; - if (!isblk) { - /* FIXME: this is for backwards compatibility with 2.4 */ - if (file->f_flags & O_APPEND) - *pos = i_size_read(inode); + /* FIXME: this is for backwards compatibility with 2.4 */ + if (iocb->ki_flags & IOCB_APPEND) + iocb->ki_pos = i_size_read(inode); - if (limit != RLIM_INFINITY) { - if (*pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - if (*count > limit - (typeof(limit))*pos) { - *count = limit - (typeof(limit))*pos; - } + pos = iocb->ki_pos; + + if (limit != RLIM_INFINITY) { + if (iocb->ki_pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; } + iov_iter_truncate(from, limit - (unsigned long)pos); } /* * LFS rule */ - if (unlikely(*pos + *count > MAX_NON_LFS && + if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && !(file->f_flags & O_LARGEFILE))) { - if (*pos >= MAX_NON_LFS) { + if (pos >= MAX_NON_LFS) return -EFBIG; - } - if (*count > MAX_NON_LFS - (unsigned long)*pos) { - *count = MAX_NON_LFS - (unsigned long)*pos; - } + iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); } /* @@ -2305,34 +2327,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i * exceeded without 
writing data we send a signal and return EFBIG. * Linus frestrict idea will clean these up nicely.. */ - if (likely(!isblk)) { - if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { - if (*count || *pos > inode->i_sb->s_maxbytes) { - return -EFBIG; - } - /* zero-length writes at ->s_maxbytes are OK */ - } - - if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) - *count = inode->i_sb->s_maxbytes - *pos; - } else { -#ifdef CONFIG_BLOCK - loff_t isize; - if (bdev_read_only(I_BDEV(inode))) - return -EPERM; - isize = i_size_read(inode); - if (*pos >= isize) { - if (*count || *pos > isize) - return -ENOSPC; - } + if (unlikely(pos >= inode->i_sb->s_maxbytes)) + return -EFBIG; - if (*pos + *count > isize) - *count = isize - *pos; -#else - return -EPERM; -#endif - } - return 0; + iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); + return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); @@ -2396,7 +2395,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) } data = *from; - written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); + written = mapping->a_ops->direct_IO(iocb, &data, pos); /* * Finally, try again to invalidate clean pages which might have been @@ -2558,24 +2557,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; ssize_t written = 0; ssize_t err; ssize_t status; - size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - - if (count == 0) - goto out; - - iov_iter_truncate(from, count); - - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -2583,10 +2571,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct 
iov_iter *from) if (err) goto out; - if (io_is_direct(file)) { - loff_t endbyte; + if (iocb->ki_flags & IOCB_DIRECT) { + loff_t pos, endbyte; - written = generic_file_direct_write(iocb, from, pos); + written = generic_file_direct_write(iocb, from, iocb->ki_pos); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to @@ -2594,13 +2582,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ - if (written < 0 || written == count || IS_DAX(inode)) + if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) goto out; - pos += written; - count -= written; - - status = generic_perform_write(file, from, pos); + status = generic_perform_write(file, from, pos = iocb->ki_pos); /* * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were @@ -2612,15 +2597,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) err = status; goto out; } - iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. 
*/ endbyte = pos + status - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + err = filemap_write_and_wait_range(mapping, pos, endbyte); if (err == 0) { + iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, @@ -2632,9 +2617,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) */ } } else { - written = generic_perform_write(file, from, pos); - if (likely(written >= 0)) - iocb->ki_pos = pos + written; + written = generic_perform_write(file, from, iocb->ki_pos); + if (likely(written > 0)) + iocb->ki_pos += written; } out: current->backing_dev_info = NULL; @@ -2658,7 +2643,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; mutex_lock(&inode->i_mutex); - ret = __generic_file_write_iter(iocb, from); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (ret > 0) { diff --git a/mm/frontswap.c b/mm/frontswap.c index 8d82809eb085..27a9924caf61 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -21,11 +21,16 @@ #include <linux/swapfile.h> /* - * frontswap_ops is set by frontswap_register_ops to contain the pointers - * to the frontswap "backend" implementation functions. + * frontswap_ops are added by frontswap_register_ops, and provide the + * frontswap "backend" implementation functions. Multiple implementations + * may be registered, but implementations can never deregister. This + * is a simple singly-linked list of all registered implementations. */ static struct frontswap_ops *frontswap_ops __read_mostly; +#define for_each_frontswap_ops(ops) \ + for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) + /* * If enabled, frontswap_store will return failure even on success. 
As * a result, the swap subsystem will always write the page to swap, in @@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { } * on all frontswap functions to not call the backend until the backend * has registered. * - * Specifically when no backend is registered (nobody called - * frontswap_register_ops) all calls to frontswap_init (which is done via - * swapon -> enable_swap_info -> frontswap_init) are registered and remembered - * (via the setting of need_init bitmap) but fail to create tmem_pools. When a - * backend registers with frontswap at some later point the previous - * calls to frontswap_init are executed (by iterating over the need_init - * bitmap) to create tmem_pools and set the respective poolids. All of that is - * guarded by us using atomic bit operations on the 'need_init' bitmap. - * * This would not guards us against the user deciding to call swapoff right as * we are calling the backend to initialize (so swapon is in action). * Fortunatly for us, the swapon_mutex has been taked by the callee so we are @@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { } * * Obviously the opposite (unloading the backend) must be done after all * the frontswap_[store|load|invalidate_area|invalidate_page] start - * ignorning or failing the requests - at which point frontswap_ops - * would have to be made in some fashion atomic. + * ignoring or failing the requests. However, there is currently no way + * to unload a backend once it is registered. */ -static DECLARE_BITMAP(need_init, MAX_SWAPFILES); /* - * Register operations for frontswap, returning previous thus allowing - * detection of multiple backends and possible nesting. 
+ * Register operations for frontswap */ -struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) +void frontswap_register_ops(struct frontswap_ops *ops) { - struct frontswap_ops *old = frontswap_ops; - int i; - - for (i = 0; i < MAX_SWAPFILES; i++) { - if (test_and_clear_bit(i, need_init)) { - struct swap_info_struct *sis = swap_info[i]; - /* __frontswap_init _should_ have set it! */ - if (!sis->frontswap_map) - return ERR_PTR(-EINVAL); - ops->init(i); - } + DECLARE_BITMAP(a, MAX_SWAPFILES); + DECLARE_BITMAP(b, MAX_SWAPFILES); + struct swap_info_struct *si; + unsigned int i; + + bitmap_zero(a, MAX_SWAPFILES); + bitmap_zero(b, MAX_SWAPFILES); + + spin_lock(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) { + if (!WARN_ON(!si->frontswap_map)) + set_bit(si->type, a); } + spin_unlock(&swap_lock); + + /* the new ops needs to know the currently active swap devices */ + for_each_set_bit(i, a, MAX_SWAPFILES) + ops->init(i); + /* - * We MUST have frontswap_ops set _after_ the frontswap_init's - * have been called. Otherwise __frontswap_store might fail. Hence - * the barrier to make sure compiler does not re-order us. + * Setting frontswap_ops must happen after the ops->init() calls + * above; cmpxchg implies smp_mb() which will ensure the init is + * complete at this point. */ - barrier(); - frontswap_ops = ops; - return old; + do { + ops->next = frontswap_ops; + } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + + spin_lock(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) { + if (si->frontswap_map) + set_bit(si->type, b); + } + spin_unlock(&swap_lock); + + /* + * On the very unlikely chance that a swap device was added or + * removed between setting the "a" list bits and the ops init + * calls, we re-check and do init or invalidate for any changed + * bits. 
+ */ + if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) { + for (i = 0; i < MAX_SWAPFILES; i++) { + if (!test_bit(i, a) && test_bit(i, b)) + ops->init(i); + else if (test_bit(i, a) && !test_bit(i, b)) + ops->invalidate_area(i); + } + } } EXPORT_SYMBOL(frontswap_register_ops); @@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); void __frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; BUG_ON(sis == NULL); @@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. */ frontswap_map_set(sis, map); - if (frontswap_ops) - frontswap_ops->init(type); - else { - BUG_ON(type >= MAX_SWAPFILES); - set_bit(type, need_init); - } + + for_each_frontswap_ops(ops) + ops->init(type); } EXPORT_SYMBOL(__frontswap_init); bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { - bool ret = false; - - if (frontswap_ops && sis->frontswap_map) - ret = test_bit(offset, sis->frontswap_map); - return ret; + if (sis->frontswap_map) + return test_bit(offset, sis->frontswap_map); + return false; } EXPORT_SYMBOL(__frontswap_test); +static inline void __frontswap_set(struct swap_info_struct *sis, + pgoff_t offset) +{ + set_bit(offset, sis->frontswap_map); + atomic_inc(&sis->frontswap_pages); +} + static inline void __frontswap_clear(struct swap_info_struct *sis, - pgoff_t offset) + pgoff_t offset) { clear_bit(offset, sis->frontswap_map); atomic_dec(&sis->frontswap_pages); @@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis, */ int __frontswap_store(struct page *page) { - int ret = -1, dup = 0; + int ret = -1; swp_entry_t entry = { .val = page_private(page), }; int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + struct frontswap_ops *ops; /* * Return if no backend registed. * Don't need to inc frontswap_failed_stores here. 
*/ if (!frontswap_ops) - return ret; + return -1; BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - if (__frontswap_test(sis, offset)) - dup = 1; - ret = frontswap_ops->store(type, offset, page); + + /* + * If a dup, we must remove the old page first; we can't leave the + * old page no matter if the store of the new page succeeds or fails, + * and we can't rely on the new page replacing the old page as we may + * not store to the same implementation that contains the old page. + */ + if (__frontswap_test(sis, offset)) { + __frontswap_clear(sis, offset); + for_each_frontswap_ops(ops) + ops->invalidate_page(type, offset); + } + + /* Try to store in each implementation, until one succeeds. */ + for_each_frontswap_ops(ops) { + ret = ops->store(type, offset, page); + if (!ret) /* successful store */ + break; + } if (ret == 0) { - set_bit(offset, sis->frontswap_map); + __frontswap_set(sis, offset); inc_frontswap_succ_stores(); - if (!dup) - atomic_inc(&sis->frontswap_pages); } else { - /* - failed dup always results in automatic invalidate of - the (older) page from frontswap - */ inc_frontswap_failed_stores(); - if (dup) { - __frontswap_clear(sis, offset); - frontswap_ops->invalidate_page(type, offset); - } } if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ @@ -268,14 +301,22 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + struct frontswap_ops *ops; + + if (!frontswap_ops) + return -1; BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - /* - * __frontswap_test() will check whether there is backend registered - */ - if (__frontswap_test(sis, offset)) - ret = frontswap_ops->load(type, offset, page); + if (!__frontswap_test(sis, offset)) + return -1; + + /* Try loading from each implementation, until one succeeds. 
*/ + for_each_frontswap_ops(ops) { + ret = ops->load(type, offset, page); + if (!ret) /* successful load */ + break; + } if (ret == 0) { inc_frontswap_loads(); if (frontswap_tmem_exclusive_gets_enabled) { @@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load); void __frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; + + if (!frontswap_ops) + return; BUG_ON(sis == NULL); - /* - * __frontswap_test() will check whether there is backend registered - */ - if (__frontswap_test(sis, offset)) { - frontswap_ops->invalidate_page(type, offset); - __frontswap_clear(sis, offset); - inc_frontswap_invalidates(); - } + if (!__frontswap_test(sis, offset)) + return; + + for_each_frontswap_ops(ops) + ops->invalidate_page(type, offset); + __frontswap_clear(sis, offset); + inc_frontswap_invalidates(); } EXPORT_SYMBOL(__frontswap_invalidate_page); @@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page); void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; - if (frontswap_ops) { - BUG_ON(sis == NULL); - if (sis->frontswap_map == NULL) - return; - frontswap_ops->invalidate_area(type); - atomic_set(&sis->frontswap_pages, 0); - bitmap_zero(sis->frontswap_map, sis->max); - } - clear_bit(type, need_init); + if (!frontswap_ops) + return; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + + for_each_frontswap_ops(ops) + ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + bitmap_zero(sis->frontswap_map, sis->max); } EXPORT_SYMBOL(__frontswap_invalidate_area); @@ -92,7 +92,7 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -265,8 +265,8 @@ static int faultin_page(struct 
task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; - /* For mlock, just skip the stack guard page. */ - if ((*flags & FOLL_MLOCK) && + /* For mm_populate(), just skip the stack guard page. */ + if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || stack_guard_page_end(vma, address + PAGE_SIZE))) return -ENOENT; @@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(get_user_pages); /** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @nonblocking: + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. + */ +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + + gup_flags = FOLL_TOUCH | FOLL_POPULATE; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. 
+ */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, + NULL, NULL, nonblocking); +} + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + long ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. populate_vma_page_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. 
+ */ + ret = populate_vma_page_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + +/** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * @@ -901,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, * * for an example see gup_get_pte in arch/x86/mm/gup.c */ - pte_t pte = ACCESS_ONCE(*ptep); + pte_t pte = READ_ONCE(*ptep); struct page *page; /* @@ -1191,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, local_irq_save(flags); pgdp = pgd_offset(mm, addr); do { - pgd_t pgd = ACCESS_ONCE(*pgdp); + pgd_t pgd = READ_ONCE(*pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6817b0350c71..097c7a4bfbd9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; static int khugepaged(void *none); static int khugepaged_slab_init(void); +static void khugepaged_slab_exit(void); #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!khugepaged_enabled()) - return 0; - for_each_populated_zone(zone) nr_zones++; @@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void) setup_per_zone_wmarks(); return 0; } -late_initcall(set_recommended_min_free_kbytes); -static int start_khugepaged(void) +static int start_stop_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { @@ -156,6 +153,7 @@ static int start_khugepaged(void) pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); 
khugepaged_thread = NULL; + goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) @@ -166,7 +164,7 @@ static int start_khugepaged(void) kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } - +fail: return err; } @@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void) struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return ACCESS_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_page); zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); @@ -202,7 +200,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return ACCESS_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_page); } static void put_huge_zero_page(void) @@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj, int err; mutex_lock(&khugepaged_mutex); - err = start_khugepaged(); + err = start_stop_khugepaged(); mutex_unlock(&khugepaged_mutex); if (err) @@ -634,27 +632,38 @@ static int __init hugepage_init(void) err = hugepage_init_sysfs(&hugepage_kobj); if (err) - return err; + goto err_sysfs; err = khugepaged_slab_init(); if (err) - goto out; + goto err_slab; - register_shrinker(&huge_zero_page_shrinker); + err = register_shrinker(&huge_zero_page_shrinker); + if (err) + goto err_hzp_shrinker; /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. 
*/ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; + return 0; + } - start_khugepaged(); + err = start_stop_khugepaged(); + if (err) + goto err_khugepaged; return 0; -out: +err_khugepaged: + unregister_shrinker(&huge_zero_page_shrinker); +err_hzp_shrinker: + khugepaged_slab_exit(); +err_slab: hugepage_exit_sysfs(hugepage_kobj); +err_sysfs: return err; } subsys_initcall(hugepage_init); @@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - struct page *page) + struct page *page, gfp_t gfp) { struct mem_cgroup *memcg; pgtable_t pgtable; @@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) return VM_FAULT_OOM; pgtable = pte_alloc_one(mm, haddr); @@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1022,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* 
For mmu_notifiers */ + gfp_t huge_gfp; /* for allocation and charge */ ptl = pmd_lockptr(mm, pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); @@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - gfp_t gfp; - - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); - new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -1130,8 +1138,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, - GFP_TRANSHUGE, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); @@ -1167,7 +1174,7 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); @@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) @@ -1389,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; /* * For architectures like ppc64 we look at deposited pgtable - * when calling pmdp_get_and_clear. So do the + * when calling pmdp_huge_get_and_clear. So do the * pgtable_trans_huge_withdraw after finishing pmdp related * operations. 
*/ - orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { @@ -1452,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); - pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl)) { @@ -1498,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_get_and_clear_notify(mm, addr, pmd); + entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mkwrite(entry); @@ -1669,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page, /* after clearing PageTail the gup refcount can be released */ smp_mb__after_atomic(); - /* - * retain hwpoison flag of the poisoned tail page: - * fix for the unsuitable process killed on Guest Machine(KVM) - * by the memory-failure. 
- */ - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | @@ -1976,6 +1978,11 @@ static int __init khugepaged_slab_init(void) return 0; } +static void __init khugepaged_slab_exit(void) +{ + kmem_cache_destroy(mm_slot_cache); +} + static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ @@ -2109,7 +2116,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) { while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval)) + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) release_pte_page(pte_page(pteval)); } } @@ -2120,13 +2127,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page; pte_t *_pte; - int none = 0; + int none_or_zero = 0; bool referenced = false, writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2207,9 +2214,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, pte_t pteval = *_pte; struct page *src_page; - if (pte_none(pteval)) { + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. 
+ */ + pte_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); @@ -2311,8 +2330,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } -static struct page -*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int node) { @@ -2326,8 +2345,7 @@ static struct page */ up_read(&mm->mmap_sem); - *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( - khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); + *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2380,13 +2398,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return true; } -static struct page -*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); + return *hpage; } #endif @@ -2421,16 +2440,21 @@ static void collapse_huge_page(struct mm_struct *mm, struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); + /* Only allocate from the target node */ + gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | + __GFP_THISNODE; + /* release the mmap_sem read lock. 
*/ - new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); if (!new_page) return; if (unlikely(mem_cgroup_try_charge(new_page, mm, - GFP_TRANSHUGE, &memcg))) + gfp, &memcg))) return; /* @@ -2470,7 +2494,7 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush(vma, address, pmd); + _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -2543,7 +2567,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none = 0; + int ret = 0, none_or_zero = 0; struct page *page; unsigned long _address; spinlock_t *ptl; @@ -2561,8 +2585,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; @@ -2770,7 +2794,7 @@ static void khugepaged_do_scan(void) cond_resched(); - if (unlikely(kthread_should_stop() || freezing(current))) + if (unlikely(kthread_should_stop() || try_to_freeze())) break; spin_lock(&khugepaged_mm_lock); @@ -2791,8 +2815,6 @@ static void khugepaged_do_scan(void) static void khugepaged_wait_work(void) { - try_to_freeze(); - if (khugepaged_has_work()) { if (!khugepaged_scan_sleep_millisecs) return; @@ -2836,7 +2858,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); diff --git a/mm/hugetlb.c 
b/mm/hugetlb.c index c41b2a0ee273..a8c3087089d8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,11 @@ int hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +/* + * Minimum page order among possible hugepage sizes, set to a proper value + * at boot time. + */ +static unsigned int minimum_order __read_mostly = UINT_MAX; __initdata LIST_HEAD(huge_boot_pages); @@ -61,6 +66,9 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; +/* Forward declaration */ +static int hugetlb_acct_memory(struct hstate *h, long delta); + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -68,23 +76,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool - * remain, free the subpool the subpool remain */ - if (free) + * remain, give up any reservations mased on minimum size and + * free the subpool */ + if (free) { + if (spool->min_hpages != -1) + hugetlb_acct_memory(spool->hstate, + -spool->min_hpages); kfree(spool); + } } -struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) +struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, + long min_hpages) { struct hugepage_subpool *spool; - spool = kmalloc(sizeof(*spool), GFP_KERNEL); + spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; - spool->max_hpages = nr_blocks; - spool->used_hpages = 0; + spool->max_hpages = max_hpages; + spool->hstate = h; + spool->min_hpages = min_hpages; + + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + kfree(spool); + return NULL; + } + spool->rsv_hpages = min_hpages; return spool; } @@ -97,36 +118,89 @@ void 
hugepage_put_subpool(struct hugepage_subpool *spool) unlock_or_release_subpool(spool); } -static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, +/* + * Subpool accounting for allocating and reserving pages. + * Return -ENOMEM if there are not enough resources to satisfy the + * the request. Otherwise, return the number of pages by which the + * global pools must be adjusted (upward). The returned value may + * only be different than the passed value (delta) in the case where + * a subpool minimum size must be manitained. + */ +static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { - int ret = 0; + long ret = delta; if (!spool) - return 0; + return ret; spin_lock(&spool->lock); - if ((spool->used_hpages + delta) <= spool->max_hpages) { - spool->used_hpages += delta; - } else { - ret = -ENOMEM; + + if (spool->max_hpages != -1) { /* maximum size accounting */ + if ((spool->used_hpages + delta) <= spool->max_hpages) + spool->used_hpages += delta; + else { + ret = -ENOMEM; + goto unlock_ret; + } + } + + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (delta > spool->rsv_hpages) { + /* + * Asking for more reserves than those already taken on + * behalf of subpool. Return difference. + */ + ret = delta - spool->rsv_hpages; + spool->rsv_hpages = 0; + } else { + ret = 0; /* reserves already accounted for */ + spool->rsv_hpages -= delta; + } } - spin_unlock(&spool->lock); +unlock_ret: + spin_unlock(&spool->lock); return ret; } -static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, +/* + * Subpool accounting for freeing and unreserving pages. + * Return the number of global page reservations that must be dropped. + * The return value may only be different than the passed value (delta) + * in the case where a subpool minimum size must be maintained. 
+ */ +static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { + long ret = delta; + if (!spool) - return; + return delta; spin_lock(&spool->lock); - spool->used_hpages -= delta; - /* If hugetlbfs_put_super couldn't free spool due to - * an outstanding quota reference, free it now. */ + + if (spool->max_hpages != -1) /* maximum size accounting */ + spool->used_hpages -= delta; + + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (spool->rsv_hpages + delta <= spool->min_hpages) + ret = 0; + else + ret = spool->rsv_hpages + delta - spool->min_hpages; + + spool->rsv_hpages += delta; + if (spool->rsv_hpages > spool->min_hpages) + spool->rsv_hpages = spool->min_hpages; + } + + /* + * If hugetlbfs_put_super couldn't free spool due to an outstanding + * quota reference, free it now. + */ unlock_or_release_subpool(spool); + + return ret; } static inline struct hugepage_subpool *subpool_inode(struct inode *inode) @@ -143,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are embedded into a resv_map and - * protected by a resv_map's lock + * The region data structures are embedded into a resv_map and protected + * by a resv_map's lock. The set of regions within the resv_map represent + * reservations for huge pages, or huge pages that have already been + * instantiated within the map. The from and to elements are huge page + * indicies into the associated mapping. from indicates the starting index + * of the region. to represents the first index past the end of the region. + * + * For example, a file region structure with from == 0 and to == 4 represents + * four huge pages in a mapping. It is important to note that the to element + * represents the first element past the end of the region. 
This is used in + * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. + * + * Interval notation of the form [from, to) will be used to indicate that + * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; @@ -152,10 +238,22 @@ struct file_region { long to; }; +/* + * Add the huge page range represented by [f, t) to the reserve + * map. Existing regions will be expanded to accommodate the + * specified range. We know only existing regions need to be + * expanded, because region_add is only called after region_chg + * with the same range. If a new file_region structure must + * be allocated, it is done in region_chg. + * + * Return the number of new huge pages added to the map. This + * number is greater than or equal to zero. + */ static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + long add = 0; spin_lock(&resv->lock); /* Locate the region we are either in or before. */ @@ -181,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t) if (rg->to > t) t = rg->to; if (rg != nrg) { + /* Decrement return value by the deleted range. + * Another range will span this area so that by + * end of routine add will be >= zero + */ + add -= (rg->to - rg->from); list_del(&rg->link); kfree(rg); } } + + add += (nrg->from - f); /* Added to beginning of region */ nrg->from = f; + add += t - nrg->to; /* Added to end of region */ nrg->to = t; + spin_unlock(&resv->lock); - return 0; + VM_BUG_ON(add < 0); + return add; } +/* + * Examine the existing reserve map and determine how many + * huge pages in the specified range [f, t) are NOT currently + * represented. This routine is called before a subsequent + * call to region_add that will actually modify the reserve + * map to add the specified range [f, t). region_chg does + * not change the number of huge pages represented by the + * map. 
However, if the existing regions in the map can not + * be expanded to represent the new range, a new file_region + * structure is added to the map as a placeholder. This is + * so that the subsequent region_add call will have all the + * regions it needs and will not fail. + * + * Returns the number of huge pages that need to be added + * to the existing reservation map for the range [f, t). + * This number is greater or equal to zero. -ENOMEM is + * returned if a new file_region structure is needed and can + * not be allocated. + */ static long region_chg(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; @@ -257,6 +384,11 @@ out_nrg: return chg; } +/* + * Truncate the reserve map at index 'end'. Modify/truncate any + * region which contains end. Delete any regions past end. + * Return the number of huge pages removed from the map. + */ static long region_truncate(struct resv_map *resv, long end) { struct list_head *head = &resv->regions; @@ -292,6 +424,10 @@ out: return chg; } +/* + * Count and return the number of huge pages in the reserve map + * that intersect with the range [f, t). + */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; @@ -839,7 +975,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); } else { - arch_release_hugepage(page); __free_pages(page, huge_page_order(h)); } } @@ -855,6 +990,31 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } +/* + * Test to determine whether the hugepage is "active/in-use" (i.e. being linked + * to hstate->hugepage_activelist.) + * + * This function can be called for tail pages, but never returns true for them. 
+ */ +bool page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHuge(page), page); + return PageHead(page) && PagePrivate(&page[1]); +} + +/* never called for tail page */ +static void set_page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + SetPagePrivate(&page[1]); +} + +static void clear_page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + ClearPagePrivate(&page[1]); +} + void free_huge_page(struct page *page) { /* @@ -874,7 +1034,16 @@ void free_huge_page(struct page *page) restore_reserve = PagePrivate(page); ClearPagePrivate(page); + /* + * A return code of zero implies that the subpool will be under its + * minimum size if the reservation is not restored after page is free. + * Therefore, force restore_reserve operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + spin_lock(&hugetlb_lock); + clear_page_huge_active(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); if (restore_reserve) @@ -891,7 +1060,6 @@ void free_huge_page(struct page *page) enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); - hugepage_subpool_put_pages(spool, 1); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -991,10 +1159,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { - if (arch_prepare_hugepage(page)) { - __free_pages(page, huge_page_order(h)); - return NULL; - } prep_new_huge_page(h, page, nid); } @@ -1086,19 +1250,13 @@ static void dissolve_free_huge_page(struct page *page) */ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned int order = 8 * sizeof(void *); unsigned long pfn; - struct hstate *h; if (!hugepages_supported()) return; - /* Set scan step to minimum hugepage size */ - for_each_hstate(h) - if (order > huge_page_order(h)) - order = huge_page_order(h); - 
VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) dissolve_free_huge_page(pfn_to_page(pfn)); } @@ -1152,11 +1310,6 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); - if (page && arch_prepare_hugepage(page)) { - __free_pages(page, huge_page_order(h)); - page = NULL; - } - spin_lock(&hugetlb_lock); if (page) { INIT_LIST_HEAD(&page->lru); @@ -1321,46 +1474,56 @@ static void return_unused_surplus_pages(struct hstate *h, } /* - * Determine if the huge page at addr within the vma has an associated - * reservation. Where it does not we will need to logically increase - * reservation and actually increase subpool usage before an allocation - * can occur. Where any new reservation would be required the - * reservation change is prepared, but not committed. Once the page - * has been allocated from the subpool and instantiated the change should - * be committed via vma_commit_reservation. No action is required on - * failure. + * vma_needs_reservation and vma_commit_reservation are used by the huge + * page allocation routines to manage reservations. + * + * vma_needs_reservation is called to determine if the huge page at addr + * within the vma has an associated reservation. If a reservation is + * needed, the value 1 is returned. The caller is then responsible for + * managing the global reservation and subpool usage counts. After + * the huge page has been allocated, vma_commit_reservation is called + * to add the page to the reservation map. + * + * In the normal case, vma_commit_reservation returns the same value + * as the preceding vma_needs_reservation call. The only time this + * is not the case is if a reserve map was changed between calls. 
It + * is the responsibility of the caller to notice the difference and + * take appropriate action. */ -static long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr) +static long __vma_reservation_common(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, + bool commit) { struct resv_map *resv; pgoff_t idx; - long chg; + long ret; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); - chg = region_chg(resv, idx, idx + 1); + if (commit) + ret = region_add(resv, idx, idx + 1); + else + ret = region_chg(resv, idx, idx + 1); if (vma->vm_flags & VM_MAYSHARE) - return chg; + return ret; else - return chg < 0 ? chg : 0; + return ret < 0 ? ret : 0; } -static void vma_commit_reservation(struct hstate *h, + +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct resv_map *resv; - pgoff_t idx; - - resv = vma_resv_map(vma); - if (!resv) - return; + return __vma_reservation_common(h, vma, addr, false); +} - idx = vma_hugecache_offset(h, vma, addr); - region_add(resv, idx, idx + 1); +static long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, true); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -1369,7 +1532,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - long chg; + long chg, commit; int ret, idx; struct hugetlb_cgroup *h_cg; @@ -1386,7 +1549,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (chg < 0) return ERR_PTR(-ENOMEM); if (chg || avoid_reserve) - if (hugepage_subpool_get_pages(spool, 1)) + if (hugepage_subpool_get_pages(spool, 1) < 0) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); @@ -1410,7 +1573,22 @@ static 
struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); - vma_commit_reservation(h, vma, addr); + commit = vma_commit_reservation(h, vma, addr); + if (unlikely(chg > commit)) { + /* + * The page was added to the reservation map between + * vma_needs_reservation and vma_commit_reservation. + * This indicates a race with hugetlb_reserve_pages. + * Adjust for the subpool count incremented above AND + * in hugetlb_reserve_pages for the same page. Also, + * the reservation count added in hugetlb_reserve_pages + * no longer applies. + */ + long rsv_adjust; + + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + } return page; out_uncharge_cgroup: @@ -1525,10 +1703,14 @@ static void __init hugetlb_init_hstates(void) struct hstate *h; for_each_hstate(h) { + if (minimum_order > huge_page_order(h)) + minimum_order = huge_page_order(h); + /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } + VM_BUG_ON(minimum_order == UINT_MAX); } static char * __init memfmt(char *buf, unsigned long n) @@ -2454,6 +2636,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; + long gbl_reserve; if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -2466,8 +2649,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&resv->refs, resv_map_release); if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); + /* + * Decrement reserve counts. The global reserve count may be + * adjusted if the subpool has a minimum size. 
+ */ + gbl_reserve = hugepage_subpool_put_pages(spool, reserve); + hugetlb_acct_memory(h, -gbl_reserve); } } @@ -2891,6 +3078,7 @@ retry_avoidcopy: copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); + set_page_huge_active(new_page); mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); @@ -3003,6 +3191,7 @@ retry: } clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); + set_page_huge_active(page); if (vma->vm_flags & VM_MAYSHARE) { int err; @@ -3278,6 +3467,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) { + remainder = 0; + break; + } + + /* * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. @@ -3438,6 +3636,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; + long gbl_reserve; /* * Only apply hugepage reservation if asked. At fault time, an @@ -3474,8 +3673,13 @@ int hugetlb_reserve_pages(struct inode *inode, goto out_err; } - /* There must be enough pages in the subpool for the mapping */ - if (hugepage_subpool_get_pages(spool, chg)) { + /* + * There must be enough pages in the subpool for the mapping. If + * the subpool has a minimum size, there may be some global + * reservations already in place (gbl_reserve). + */ + gbl_reserve = hugepage_subpool_get_pages(spool, chg); + if (gbl_reserve < 0) { ret = -ENOSPC; goto out_err; } @@ -3484,9 +3688,10 @@ int hugetlb_reserve_pages(struct inode *inode, * Check enough hugepages are available for the reservation. 
* Hand the pages back to the subpool if there are not */ - ret = hugetlb_acct_memory(h, chg); + ret = hugetlb_acct_memory(h, gbl_reserve); if (ret < 0) { - hugepage_subpool_put_pages(spool, chg); + /* put back original number of pages, chg */ + (void)hugepage_subpool_put_pages(spool, chg); goto out_err; } @@ -3501,8 +3706,24 @@ int hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(resv_map, from, to); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + long add = region_add(resv_map, from, to); + + if (unlikely(chg > add)) { + /* + * pages in this range were added to the reserve + * map between region_chg and region_add. This + * indicates a race with alloc_huge_page. Adjust + * the subpool and reserve counts modified above + * based on the difference. + */ + long rsv_adjust; + + rsv_adjust = hugepage_subpool_put_pages(spool, + chg - add); + hugetlb_acct_memory(h, -rsv_adjust); + } + } return 0; out_err: if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) @@ -3516,6 +3737,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + long gbl_reserve; if (resv_map) chg = region_truncate(resv_map, offset); @@ -3523,8 +3745,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); - hugepage_subpool_put_pages(spool, (chg - freed)); - hugetlb_acct_memory(h, -(chg - freed)); + /* + * If the subpool has a minimum size, the number of global + * reservations to be released may be adjusted. 
+ */ + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); + hugetlb_acct_memory(h, -gbl_reserve); } #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -3659,6 +3885,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { return NULL; } + +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + return 0; +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ @@ -3735,8 +3966,7 @@ retry: if (!pmd_huge(*pmd)) goto out; if (pmd_present(*pmd)) { - page = pte_page(*(pte_t *)pmd) + - ((address & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { @@ -3767,20 +3997,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_MEMORY_FAILURE -/* Should be called in hugetlb_lock */ -static int is_hugepage_on_freelist(struct page *hpage) -{ - struct page *page; - struct page *tmp; - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - - list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) - if (page == hpage) - return 1; - return 0; -} - /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. @@ -3792,7 +4008,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) int ret = -EBUSY; spin_lock(&hugetlb_lock); - if (is_hugepage_on_freelist(hpage)) { + /* + * Just checking !page_huge_active is not enough, because that could be + * an isolated/hwpoisoned hugepage (which have >0 refcount). 
+ */ + if (!page_huge_active(hpage) && !page_count(hpage)) { /* * Hwpoisoned hugepage isn't linked to activelist or freelist, * but dangling hpage->lru can trigger list-debug warnings @@ -3812,42 +4032,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) bool isolate_huge_page(struct page *page, struct list_head *list) { + bool ret = true; + VM_BUG_ON_PAGE(!PageHead(page), page); - if (!get_page_unless_zero(page)) - return false; spin_lock(&hugetlb_lock); + if (!page_huge_active(page) || !get_page_unless_zero(page)) { + ret = false; + goto unlock; + } + clear_page_huge_active(page); list_move_tail(&page->lru, list); +unlock: spin_unlock(&hugetlb_lock); - return true; + return ret; } void putback_active_hugepage(struct page *page) { VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); + set_page_huge_active(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); spin_unlock(&hugetlb_lock); put_page(page); } - -bool is_hugepage_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHuge(page), page); - /* - * This function can be called for a tail page because the caller, - * scan_movable_pages, scans through a given pfn-range which typically - * covers one memory block. In systems using gigantic hugepage (1GB - * for x86_64,) a hugepage is larger than a memory block, and we don't - * support migrating such large hugepages for now, so return false - * when called for tail pages. - */ - if (PageTail(page)) - return false; - /* - * Refcount of a hwpoisoned hugepages is 1, but they are not active, - * so we should return false for them. - */ - if (unlikely(PageHWPoison(page))) - return false; - return page_count(page) > 0; -} diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 329caf56df22..bf73ac17dad4 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -28,19 +28,19 @@ static int hwpoison_inject(void *data, u64 val) /* * This implies unable to support free buddy pages. 
*/ - if (!get_page_unless_zero(hpage)) + if (!get_hwpoison_page(p)) return 0; if (!hwpoison_filter_enable) goto inject; - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); + if (!PageLRU(hpage) && !PageHuge(p)) + shake_page(hpage, 0); /* * This implies unable to support non-LRU pages. */ - if (!PageLRU(p) && !PageHuge(p)) - return 0; + if (!PageLRU(hpage) && !PageHuge(p)) + goto put_out; /* * do a racy check with elevated page count, to make sure PG_hwpoison @@ -52,11 +52,14 @@ static int hwpoison_inject(void *data, u64 val) err = hwpoison_filter(hpage); unlock_page(hpage); if (err) - return 0; + goto put_out; inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); +put_out: + put_page(p); + return 0; } static int hwpoison_unpoison(void *data, u64 val) diff --git a/mm/internal.h b/mm/internal.h index a96da5b0029d..36b23f1e2ca6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -155,7 +155,8 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); @@ -200,6 +201,8 @@ isolate_freepages_range(struct compact_control *cc, unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal); #endif @@ -222,13 +225,13 @@ static inline unsigned long page_order(struct page *page) * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. 
* - * ACCESS_ONCE is used so that if the caller assigns the result into a local + * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ -#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) +#define page_order_unsafe(page) READ_ONCE(page_private(page)) static inline bool is_cow_mapping(vm_flags_t flags) { @@ -240,7 +243,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); #ifdef CONFIG_MMU -extern long __mlock_vma_pages_range(struct vm_area_struct *vma, +extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -359,10 +362,7 @@ do { \ } while (0) extern void mminit_verify_pageflags_layout(void); -extern void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn); extern void mminit_verify_zonelist(void); - #else static inline void mminit_dprintk(enum mminit_level level, @@ -374,11 +374,6 @@ static inline void mminit_verify_pageflags_layout(void) { } -static inline void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn) -{ -} - static inline void mminit_verify_zonelist(void) { } diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 936d81661c47..6c513a63ea84 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size) kasan_kmalloc(page->slab_cache, object, size); } +void kasan_kfree(void *ptr) +{ + struct page *page; + + page = virt_to_head_page(ptr); + + if 
(unlikely(!PageSlab(page))) + kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), + KASAN_FREE_PAGE); + else + kasan_slab_free(page->slab_cache, ptr); +} + void kasan_kfree_large(const void *ptr) { struct page *page = virt_to_page(ptr); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4986b0acab21..c242adf6bc85 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,7 +7,6 @@ #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) #define KASAN_FREE_PAGE 0xFF /* page was freed */ -#define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5405aff5a590..cf79f110157c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -53,6 +53,13 @@ * modifications to the memory scanning parameters including the scan_thread * pointer * + * Locks and mutexes are acquired/nested in the following order: + * + * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING) + * + * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex + * regions. + * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. 
When the use_count becomes * 0, this count can no longer be incremented and put_object() schedules the @@ -115,7 +122,8 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ + __GFP_NOACCOUNT)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) @@ -194,6 +202,8 @@ static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ static int kmemleak_enabled; +/* same as above but only for the kmemleak_free() callback */ +static int kmemleak_free_enabled; /* set in the late_initcall if there were no errors */ static int kmemleak_initialized; /* enables or disables early logging of the memory operations */ @@ -482,8 +492,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) rcu_read_lock(); read_lock_irqsave(&kmemleak_lock, flags); - if (ptr >= min_addr && ptr < max_addr) - object = lookup_object(ptr, alias); + object = lookup_object(ptr, alias); read_unlock_irqrestore(&kmemleak_lock, flags); /* check whether the object is still available */ @@ -495,6 +504,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) } /* + * Look up an object in the object search tree and remove it from both + * object_tree_root and object_list. The returned object's use_count should be + * at least 1, as initially set by create_object(). + */ +static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias) +{ + unsigned long flags; + struct kmemleak_object *object; + + write_lock_irqsave(&kmemleak_lock, flags); + object = lookup_object(ptr, alias); + if (object) { + rb_erase(&object->rb_node, &object_tree_root); + list_del_rcu(&object->object_list); + } + write_unlock_irqrestore(&kmemleak_lock, flags); + + return object; +} + +/* * Save stack trace to the given array of MAX_TRACE size. 
*/ static int __save_stack_trace(unsigned long *trace) @@ -579,11 +609,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, kmemleak_stop("Cannot insert 0x%lx into the object " "search tree (overlaps existing)\n", ptr); + /* + * No need for parent->lock here since "parent" cannot + * be freed while the kmemleak_lock is held. + */ + dump_object_info(parent); kmem_cache_free(object_cache, object); - object = parent; - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); + object = NULL; goto out; } } @@ -597,20 +629,14 @@ out: } /* - * Remove the metadata (struct kmemleak_object) for a memory block from the - * object_list and object_tree_root and decrement its use_count. + * Mark the object as not allocated and schedule RCU freeing via put_object(). */ static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - write_lock_irqsave(&kmemleak_lock, flags); - rb_erase(&object->rb_node, &object_tree_root); - list_del_rcu(&object->object_list); - write_unlock_irqrestore(&kmemleak_lock, flags); - WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 2); + WARN_ON(atomic_read(&object->use_count) < 1); /* * Locking here also ensures that the corresponding memory block @@ -630,7 +656,7 @@ static void delete_object_full(unsigned long ptr) { struct kmemleak_object *object; - object = find_and_get_object(ptr, 0); + object = find_and_remove_object(ptr, 0); if (!object) { #ifdef DEBUG kmemleak_warn("Freeing unknown object at 0x%08lx\n", @@ -639,7 +665,6 @@ static void delete_object_full(unsigned long ptr) return; } __delete_object(object); - put_object(object); } /* @@ -652,7 +677,7 @@ static void delete_object_part(unsigned long ptr, size_t size) struct kmemleak_object *object; unsigned long start, end; - object = find_and_get_object(ptr, 1); + object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG kmemleak_warn("Partially freeing unknown object 
at 0x%08lx " @@ -660,7 +685,6 @@ static void delete_object_part(unsigned long ptr, size_t size) #endif return; } - __delete_object(object); /* * Create one or two objects that may result from the memory block @@ -678,7 +702,7 @@ static void delete_object_part(unsigned long ptr, size_t size) create_object(ptr + size, end - ptr - size, object->min_count, GFP_KERNEL); - put_object(object); + __delete_object(object); } static void __paint_it(struct kmemleak_object *object, int color) @@ -906,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); * kmemleak_alloc_percpu - register a newly allocated __percpu object * @ptr: __percpu pointer to beginning of the object * @size: size of the object + * @gfp: flags used for kmemleak internal memory allocations * * This function is called from the kernel percpu allocator when a new object - * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL - * allocation. + * (memory block) is allocated (alloc_percpu). */ -void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) +void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) { unsigned int cpu; @@ -924,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), - size, 0, GFP_KERNEL); + size, 0, gfp); else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } @@ -941,7 +966,7 @@ void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); else if (kmemleak_early_log) log_early(KMEMLEAK_FREE, ptr, 0, 0); @@ -981,7 +1006,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); - if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + if (kmemleak_free_enabled && ptr && 
!IS_ERR(ptr)) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); @@ -1147,19 +1172,18 @@ static int scan_should_stop(void) * found to the gray list. */ static void scan_block(void *_start, void *_end, - struct kmemleak_object *scanned, int allow_resched) + struct kmemleak_object *scanned) { unsigned long *ptr; unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); + unsigned long flags; + read_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; - unsigned long flags; unsigned long pointer; - if (allow_resched) - cond_resched(); if (scan_should_stop()) break; @@ -1172,26 +1196,31 @@ static void scan_block(void *_start, void *_end, pointer = *ptr; kasan_enable_current(); - object = find_and_get_object(pointer, 1); + if (pointer < min_addr || pointer >= max_addr) + continue; + + /* + * No need for get_object() here since we hold kmemleak_lock. + * object->use_count cannot be dropped to 0 while the object + * is still present in object_tree_root and object_list + * (with updates protected by kmemleak_lock). + */ + object = lookup_object(pointer, 1); if (!object) continue; - if (object == scanned) { + if (object == scanned) /* self referenced, ignore */ - put_object(object); continue; - } /* * Avoid the lockdep recursive warning on object->lock being * previously acquired in scan_object(). These locks are * enclosed by scan_mutex. 
*/ - spin_lock_irqsave_nested(&object->lock, flags, - SINGLE_DEPTH_NESTING); + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); if (!color_white(object)) { /* non-orphan, ignored or new */ - spin_unlock_irqrestore(&object->lock, flags); - put_object(object); + spin_unlock(&object->lock); continue; } @@ -1203,13 +1232,27 @@ static void scan_block(void *_start, void *_end, */ object->count++; if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); list_add_tail(&object->gray_list, &gray_list); - spin_unlock_irqrestore(&object->lock, flags); - continue; } + spin_unlock(&object->lock); + } + read_unlock_irqrestore(&kmemleak_lock, flags); +} - spin_unlock_irqrestore(&object->lock, flags); - put_object(object); +/* + * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency. + */ +static void scan_large_block(void *start, void *end) +{ + void *next; + + while (start < end) { + next = min(start + MAX_SCAN_SIZE, end); + scan_block(start, next, NULL); + start = next; + cond_resched(); } } @@ -1235,22 +1278,25 @@ static void scan_object(struct kmemleak_object *object) if (hlist_empty(&object->area_list)) { void *start = (void *)object->pointer; void *end = (void *)(object->pointer + object->size); + void *next; - while (start < end && (object->flags & OBJECT_ALLOCATED) && - !(object->flags & OBJECT_NO_SCAN)) { - scan_block(start, min(start + MAX_SCAN_SIZE, end), - object, 0); - start += MAX_SCAN_SIZE; + do { + next = min(start + MAX_SCAN_SIZE, end); + scan_block(start, next, object); + + start = next; + if (start >= end) + break; spin_unlock_irqrestore(&object->lock, flags); cond_resched(); spin_lock_irqsave(&object->lock, flags); - } + } while (object->flags & OBJECT_ALLOCATED); } else hlist_for_each_entry(area, &object->area_list, node) scan_block((void *)area->start, (void *)(area->start + area->size), - object, 0); + object); out: spin_unlock_irqrestore(&object->lock, flags); } @@ -1327,14 
+1373,14 @@ static void kmemleak_scan(void) rcu_read_unlock(); /* data/bss scanning */ - scan_block(_sdata, _edata, NULL, 1); - scan_block(__bss_start, __bss_stop, NULL, 1); + scan_large_block(_sdata, _edata); + scan_large_block(__bss_start, __bss_stop); #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) - scan_block(__per_cpu_start + per_cpu_offset(i), - __per_cpu_end + per_cpu_offset(i), NULL, 1); + scan_large_block(__per_cpu_start + per_cpu_offset(i), + __per_cpu_end + per_cpu_offset(i)); #endif /* @@ -1355,7 +1401,7 @@ static void kmemleak_scan(void) /* only scan if page is in use */ if (page_count(page) == 0) continue; - scan_block(page, page + 1, NULL, 1); + scan_block(page, page + 1, NULL); } } put_online_mems(); @@ -1369,7 +1415,7 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); do_each_thread(g, p) { scan_block(task_stack_page(p), task_stack_page(p) + - THREAD_SIZE, NULL, 0); + THREAD_SIZE, NULL); } while_each_thread(g, p); read_unlock(&tasklist_lock); } @@ -1746,15 +1792,20 @@ static void __kmemleak_do_cleanup(void) */ static void kmemleak_do_cleanup(struct work_struct *work) { - mutex_lock(&scan_mutex); stop_scan_thread(); + /* + * Once the scan thread has stopped, it is safe to no longer track + * object freeing. Ordering of the scan thread stopping and the memory + * accesses below is guaranteed by the kthread_stop() function. + */ + kmemleak_free_enabled = 0; + if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); else pr_info("Kmemleak disabled without freeing internal data. 
" "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); - mutex_unlock(&scan_mutex); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); @@ -1775,6 +1826,8 @@ static void kmemleak_disable(void) /* check whether it is too early for a kernel thread */ if (kmemleak_initialized) schedule_work(&cleanup_work); + else + kmemleak_free_enabled = 0; pr_info("Kernel memory leak detector disabled\n"); } @@ -1839,8 +1892,10 @@ void __init kmemleak_init(void) if (kmemleak_error) { local_irq_restore(flags); return; - } else + } else { kmemleak_enabled = 1; + kmemleak_free_enabled = 1; + } local_irq_restore(flags); /* @@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) expected_mapping = (void *)stable_node + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); again: - kpfn = ACCESS_ONCE(stable_node->kpfn); + kpfn = READ_ONCE(stable_node->kpfn); page = pfn_to_page(kpfn); /* @@ -551,7 +551,7 @@ again: * but on Alpha we need to be more careful. */ smp_read_barrier_depends(); - if (ACCESS_ONCE(page->mapping) != expected_mapping) + if (READ_ONCE(page->mapping) != expected_mapping) goto stale; /* @@ -577,14 +577,14 @@ again: cpu_relax(); } - if (ACCESS_ONCE(page->mapping) != expected_mapping) { + if (READ_ONCE(page->mapping) != expected_mapping) { put_page(page); goto stale; } if (lock_it) { lock_page(page); - if (ACCESS_ONCE(page->mapping) != expected_mapping) { + if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); goto stale; @@ -600,7 +600,7 @@ stale: * before checking whether node->kpfn has been changed. 
*/ smp_rmb(); - if (ACCESS_ONCE(stable_node->kpfn) != kpfn) + if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; diff --git a/mm/madvise.c b/mm/madvise.c index d551475517bf..64bb8a22110c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> diff --git a/mm/memblock.c b/mm/memblock.c index 252b77bdf65e..87108e77e476 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock; #ifdef CONFIG_MOVABLE_NODE bool movable_node_enabled __initdata_memblock = false; #endif +static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; +ulong __init_memblock choose_memblock_flags(void) +{ + return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; +} + /* inline so we don't get a warning when pr_debug is compiled out */ static __init_memblock const char * memblock_type_name(struct memblock_type *type) @@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area bottom-up. 
* @@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, */ static phys_addr_t __init_memblock __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) + phys_addr_t size, phys_addr_t align, int nid, + ulong flags) { phys_addr_t this_start, this_end, cand; u64 i; - for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { + for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); @@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area top-down. * @@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, */ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) + phys_addr_t size, phys_addr_t align, int nid, + ulong flags) { phys_addr_t this_start, this_end, cand; u64 i; - for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { + for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end, + NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); @@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Find @size free area aligned to @align in the specified range and node. 
* @@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, ulong flags) { phys_addr_t kernel_end, ret; @@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /* ok, try bottom-up allocation first */ ret = __memblock_find_range_bottom_up(bottom_up_start, end, - size, align, nid); + size, align, nid, flags); if (ret) return ret; @@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, "memory hotunplug may be affected\n"); } - return __memblock_find_range_top_down(start, end, size, align, nid); + return __memblock_find_range_top_down(start, end, size, align, nid, + flags); } /** @@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(size, align, start, end, - NUMA_NO_NODE); + phys_addr_t ret; + ulong flags = choose_memblock_flags(); + +again: + ret = memblock_find_in_range_node(size, align, start, end, + NUMA_NO_NODE, flags); + + if (!ret && (flags & MEMBLOCK_MIRROR)) { + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + flags &= ~MEMBLOCK_MIRROR; + goto again; + } + + return ret; } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -580,10 +606,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, return memblock_add_range(&memblock.memory, base, size, nid, 0); } +static int __init_memblock memblock_add_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) +{ + struct memblock_type *_rgn = &memblock.memory; + + memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", + (unsigned long long)base, + (unsigned long long)base + size - 1, + 
flags, (void *)_RET_IP_); + + return memblock_add_range(_rgn, base, size, nid, flags); +} + int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_range(&memblock.memory, base, size, - MAX_NUMNODES, 0); + return memblock_add_region(base, size, MAX_NUMNODES, 0); } /** @@ -699,14 +739,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *_rgn = &memblock.reserved; + struct memblock_type *type = &memblock.reserved; memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(_rgn, base, size, nid, flags); + return memblock_add_range(type, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) @@ -765,9 +805,57 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) } /** + * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) +{ + system_has_some_mirror = true; + + return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); +} + + +/** + * __next_reserved_mem_region - next function for for_each_reserved_region() + * @idx: pointer to u64 loop variable + * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL + * + * Iterate over all reserved memory regions. 
+ */ +void __init_memblock __next_reserved_mem_region(u64 *idx, + phys_addr_t *out_start, + phys_addr_t *out_end) +{ + struct memblock_type *rsv = &memblock.reserved; + + if (*idx >= 0 && *idx < rsv->cnt) { + struct memblock_region *r = &rsv->regions[*idx]; + phys_addr_t base = r->base; + phys_addr_t size = r->size; + + if (out_start) + *out_start = base; + if (out_end) + *out_end = base + size - 1; + + *idx += 1; + return; + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + +/** * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL @@ -789,7 +877,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. 
*/ -void __init_memblock __next_mem_range(u64 *idx, int nid, +void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -817,6 +905,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -881,6 +973,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, * * @idx: pointer to u64 loop variable * @nid: nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL @@ -889,7 +982,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, * * Reverse of __next_mem_range(). 
*/ -void __init_memblock __next_mem_range_rev(u64 *idx, int nid, +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -921,6 +1014,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1036,14 +1133,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, ulong flags) { phys_addr_t found; if (!align) align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, start, end, nid); + found = memblock_find_in_range_node(size, align, start, end, nid, + flags); if (found && !memblock_reserve(found, size)) { /* * The min_count is set to 0 so that memblock allocations are @@ -1056,26 +1154,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, } phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end) + phys_addr_t start, phys_addr_t end, + ulong flags) { - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, + flags); } static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr, - int nid) + int nid, ulong flags) { - return memblock_alloc_range_nid(size, align, 0, max_addr, nid); + return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - return memblock_alloc_base_nid(size, 
align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + ulong flags = choose_memblock_flags(); + phys_addr_t ret; + +again: + ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, flags); + + if (!ret && (flags & MEMBLOCK_MIRROR)) { + flags &= ~MEMBLOCK_MIRROR; + goto again; + } + return ret; } phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE, + MEMBLOCK_NONE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) @@ -1139,6 +1251,7 @@ static void * __init memblock_virt_alloc_internal( { phys_addr_t alloc; void *ptr; + ulong flags = choose_memblock_flags(); if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; @@ -1159,13 +1272,14 @@ static void * __init memblock_virt_alloc_internal( again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, - nid); + nid, flags); if (alloc) goto done; if (nid != NUMA_NO_NODE) { alloc = memblock_find_in_range_node(size, align, min_addr, - max_addr, NUMA_NO_NODE); + max_addr, NUMA_NO_NODE, + flags); if (alloc) goto done; } @@ -1173,10 +1287,16 @@ again: if (min_addr) { min_addr = 0; goto again; - } else { - goto error; } + if (flags & MEMBLOCK_MIRROR) { + flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + goto again; + } + + return NULL; done: memblock_reserve(alloc, size); ptr = phys_to_virt(alloc); @@ -1191,9 +1311,6 @@ done: kmemleak_alloc(ptr, size, 0, 0); return ptr; - -error: - return NULL; } /** @@ -1302,7 +1419,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); 
totalram_pages++; } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b34ef4a32a3b..acb93c554f6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -14,6 +14,12 @@ * Copyright (C) 2012 Parallels Inc. and Google Inc. * Authors: Glauber Costa and Suleiman Souhlal * + * Native page reclaim + * Charge lifetime sanitation + * Lockless page tracking & accounting + * Unified hierarchy configuration model + * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -71,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; +struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -84,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = { "rss", "rss_huge", "mapped_file", + "dirty", "writeback", "swap", }; @@ -253,11 +261,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. - * - * TODO: Add a water mark for the memory controller. Reclaim will begin when - * we hit the water mark. May be even add a low water mark, such that - * no reclaim occurs from a cgroup at it's low water mark, this is - * a feature that will be implemented much later in the future. 
*/ struct mem_cgroup { struct cgroup_subsys_state css; @@ -284,9 +287,9 @@ struct mem_cgroup { */ bool use_hierarchy; + /* protected by memcg_oom_lock */ bool oom_lock; - atomic_t under_oom; - atomic_t oom_wakeups; + int under_oom; int swappiness; /* OOM-Killer disable */ @@ -321,11 +324,6 @@ struct mem_cgroup { * percpu counter. */ struct mem_cgroup_stat_cpu __percpu *stat; - /* - * used when a cpu is offlined or other synchronizations - * See mem_cgroup_read_stat(). - */ - struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) @@ -345,6 +343,11 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; + struct wb_domain cgwb_domain; +#endif + /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -454,6 +457,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) return memcg->css.id; } +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) + */ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; @@ -589,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) return &memcg->css; } +/** + * mem_cgroup_css_from_page - css of the memcg associated with a page + * @page: page of interest + * + * If memcg is bound to the default hierarchy, css of the memcg associated + * with @page is returned. The returned css remains associated with @page + * until it is released. + * + * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup + * is returned. 
+ * + * XXX: The above description of behavior on the default hierarchy isn't + * strictly true yet as replace_page_cache_page() can modify the + * association before @page is released even on the default hierarchy; + * however, the current and planned usages don't mix the two functions + * and replace_page_cache_page() will soon be updated to make the invariant + * actually true. + */ +struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + + memcg = page->mem_cgroup; + + if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) + memcg = root_mem_cgroup; + + rcu_read_unlock(); + return &memcg->css; +} + static struct mem_cgroup_per_zone * mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { @@ -667,7 +709,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static unsigned long soft_limit_excess(struct mem_cgroup *memcg) { unsigned long nr_pages = page_counter_read(&memcg->memory); - unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); + unsigned long soft_limit = READ_ONCE(memcg->soft_limit); unsigned long excess = 0; if (nr_pages > soft_limit) @@ -788,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->count[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.count[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); return val; } @@ -806,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.events[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - 
put_online_cpus(); return val; } @@ -1035,7 +1063,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, goto out_unlock; do { - pos = ACCESS_ONCE(iter->position); + pos = READ_ONCE(iter->position); /* * A racing update may change the position and * put the last reference, hence css_tryget(), @@ -1352,13 +1380,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = ACCESS_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.limit); if (count < limit) margin = limit - count; if (do_swap_account) { count = page_counter_read(&memcg->memsw); - limit = ACCESS_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) margin = min(margin, limit - count); } @@ -1436,15 +1464,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) struct mem_cgroup *iter; unsigned int i; - if (!p) - return; - mutex_lock(&oom_info_lock); rcu_read_lock(); - pr_info("Task in "); - pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); + if (p) { + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_cont(" killed as a result of limit of "); + } else { + pr_info("Memory limit reached of cgroup "); + } + pr_cont_cgroup_path(memcg->css.cgroup); pr_cont("\n"); @@ -1521,17 +1551,19 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int points = 0; struct task_struct *chosen = NULL; + mutex_lock(&oom_lock); + /* * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. 
*/ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - mark_tsk_oom_victim(current); - return; + mark_oom_victim(current); + goto unlock; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1555,7 +1587,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); - return; + goto unlock; case OOM_SCAN_OK: break; }; @@ -1576,11 +1608,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_end(&it); } - if (!chosen) - return; - points = chosen_points * 1000 / totalpages; - oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, - NULL, "Memory cgroup out of memory"); + if (chosen) { + points = chosen_points * 1000 / totalpages; + oom_kill_process(chosen, gfp_mask, order, points, totalpages, + memcg, NULL, "Memory cgroup out of memory"); + } +unlock: + mutex_unlock(&oom_lock); } #if MAX_NUMNODES > 1 @@ -1797,8 +1831,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_inc(&iter->under_oom); + iter->under_oom++; + spin_unlock(&memcg_oom_lock); } static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) @@ -1807,11 +1843,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) /* * When a new child is created while the hierarchy is under oom, - * mem_cgroup_oom_lock() may not be called. We have to use - * atomic_add_unless() here. + * mem_cgroup_oom_lock() may not be called. Watch for underflow. 
*/ + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_add_unless(&iter->under_oom, -1, 0); + if (iter->under_oom > 0) + iter->under_oom--; + spin_unlock(&memcg_oom_lock); } static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1837,17 +1875,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait, return autoremove_wake_function(wait, mode, sync, arg); } -static void memcg_wakeup_oom(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->oom_wakeups); - /* for filtering, pass "memcg" as argument. */ - __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); -} - static void memcg_oom_recover(struct mem_cgroup *memcg) { - if (memcg && atomic_read(&memcg->under_oom)) - memcg_wakeup_oom(memcg); + /* + * For the following lockless ->under_oom test, the only required + * guarantee is that it must see the state asserted by an OOM when + * this function is called as a result of userland actions + * triggered by the notification of the OOM. This is trivially + * achieved by invoking mem_cgroup_mark_under_oom() before + * triggering notification. + */ + if (memcg && memcg->under_oom) + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) @@ -2002,6 +2041,7 @@ again: return memcg; } +EXPORT_SYMBOL(mem_cgroup_begin_page_stat); /** * mem_cgroup_end_page_stat - finish a page state statistics transaction @@ -2020,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } +EXPORT_SYMBOL(mem_cgroup_end_page_stat); /** * mem_cgroup_update_page_stat - update page state statistics @@ -2160,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -/* - * This function drains percpu counter value from DEAD cpu and - * move it to local cpu. Note that this function can be preempted. 
- */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) -{ - int i; - - spin_lock(&memcg->pcp_counter_lock); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - long x = per_cpu(memcg->stat->count[i], cpu); - - per_cpu(memcg->stat->count[i], cpu) = 0; - memcg->nocpu_base.count[i] += x; - } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { - unsigned long x = per_cpu(memcg->stat->events[i], cpu); - - per_cpu(memcg->stat->events[i], cpu) = 0; - memcg->nocpu_base.events[i] += x; - } - spin_unlock(&memcg->pcp_counter_lock); -} - static int memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; - struct mem_cgroup *iter; if (action == CPU_ONLINE) return NOTIFY_OK; @@ -2198,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup(iter) - mem_cgroup_drain_pcp_counter(iter, cpu); - stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); return NOTIFY_OK; @@ -2314,6 +2327,8 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + if (!(gfp_mask & __GFP_WAIT)) + goto done; /* * If the hierarchy is above the normal consumption range, * make the charging task trim their excess contribution. @@ -2341,20 +2356,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) } /* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) 
- */ -static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) -{ - /* ID 0 is unused ID */ - if (!id) - return NULL; - return mem_cgroup_from_id(id); -} - -/* * try_get_mem_cgroup_from_page - look up page's memcg association * @page: the page * @@ -2380,7 +2381,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); + memcg = mem_cgroup_from_id(id); if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); @@ -2642,7 +2643,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) return cachep; memcg = get_mem_cgroup_from_mm(current->mm); - kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); + kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2779,92 +2780,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/** - * mem_cgroup_move_account - move account of the page - * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. - * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 - * - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" - * from old cgroup. - */ -static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, - struct mem_cgroup *from, - struct mem_cgroup *to) -{ - unsigned long flags; - int ret; - - VM_BUG_ON(from == to); - VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. 
- */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; - - /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. - */ - if (!trylock_page(page)) - goto out; - - ret = -EINVAL; - if (page->mem_cgroup != from) - goto out_unlock; - - spin_lock_irqsave(&from->move_lock, flags); - - if (!PageAnon(page) && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - } - - if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - } - - /* - * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. - */ - - /* caller should have done css_get */ - page->mem_cgroup = to; - spin_unlock_irqrestore(&from->move_lock, flags); - - ret = 0; - - local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); - local_irq_enable(); -out_unlock: - unlock_page(page); -out: - return ret; -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -3953,7 +3868,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, list_add(&event->list, &memcg->oom_notify); /* already in OOM ? 
*/ - if (atomic_read(&memcg->under_oom)) + if (memcg->under_oom) eventfd_signal(eventfd, 1); spin_unlock(&memcg_oom_lock); @@ -3982,7 +3897,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); - seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); + seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); return 0; } @@ -4084,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_CGROUP_WRITEBACK + +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) +{ + return &memcg->cgwb_list; +} + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ + wb_domain_size_changed(&memcg->cgwb_domain); +} + +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +/** + * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg + * @wb: bdi_writeback in question + * @pavail: out parameter for number of available pages + * @pdirty: out parameter for number of dirty pages + * @pwriteback: out parameter for number of pages under writeback + * + * Determine the numbers of available, dirty, and writeback pages in @wb's + * memcg. Dirty and writeback are self-explanatory. Available is a bit + * more involved. + * + * A memcg's headroom is "min(max, high) - used". The available memory is + * calculated as the lowest headroom of itself and the ancestors plus the + * number of pages already being used for file pages. 
Note that this + * doesn't consider the actual amount of available memory in the system. + * The caller should further cap *@pavail accordingly. + */ +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, + unsigned long *pdirty, unsigned long *pwriteback) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + struct mem_cgroup *parent; + unsigned long head_room = PAGE_COUNTER_MAX; + unsigned long file_pages; + + *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); + + /* this should eventually include NR_UNSTABLE_NFS */ + *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + + file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | + (1 << LRU_ACTIVE_FILE)); + while ((parent = parent_mem_cgroup(memcg))) { + unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long used = page_counter_read(&memcg->memory); + + head_room = min(head_room, ceiling - min(ceiling, used)); + memcg = parent; + } + + *pavail = file_pages + head_room; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * DO NOT USE IN NEW FILES. 
* @@ -4468,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto out_free; + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto out_free_stat; + spin_lock_init(&memcg->pcp_counter_lock); return memcg; +out_free_stat: + free_percpu(memcg->stat); out_free: kfree(memcg); return NULL; @@ -4497,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); + memcg_wb_domain_exit(memcg); kfree(memcg); } @@ -4529,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; + mem_cgroup_root_css = &memcg->css; page_counter_init(&memcg->memory, NULL); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -4547,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); +#endif return &memcg->css; free_out: @@ -4635,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); memcg_deactivate_kmem(memcg); + + wb_memcg_offline(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) @@ -4668,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; + memcg_wb_domain_size_changed(memcg); } #ifdef CONFIG_MMU @@ -4816,6 +4836,111 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return page; } +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. @from != @to. 
+ * + * The caller must ensure the following: + * - page is not on LRU (isolate_page() is useful.) + * - compound_lock is held when nr_pages > 1 + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + unsigned long flags; + int ret; + bool anon; + + VM_BUG_ON(from == to); + VM_BUG_ON_PAGE(PageLRU(page), page); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + /* + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out; + + ret = -EINVAL; + if (page->mem_cgroup != from) + goto out_unlock; + + anon = PageAnon(page); + + spin_lock_irqsave(&from->move_lock, flags); + + if (!anon && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } + + /* + * move_lock grabbed above and caller set from->moving_account, so + * mem_cgroup_update_page_stat() will serialize updates to PageDirty. + * So mapping should be stable for dirty pages. 
+ */ + if (!anon && PageDirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + } + } + + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } + + /* + * It is safe to change page->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ + + /* caller should have done css_get */ + page->mem_cgroup = to; + spin_unlock_irqrestore(&from->move_lock, flags); + + ret = 0; + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); + memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); + memcg_check_events(from, page); + local_irq_enable(); +out_unlock: + unlock_page(page); +out: + return ret; +} + static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { @@ -5012,7 +5137,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * tunable will only affect upcoming migrations, not the current one. * So we need to save it, and keep it going. 
*/ - move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); if (move_flags) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); @@ -5246,7 +5371,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = ACCESS_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5276,7 +5401,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long high = ACCESS_ONCE(memcg->high); + unsigned long high = READ_ONCE(memcg->high); if (high == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5300,13 +5425,14 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + memcg_wb_domain_size_changed(memcg); return nbytes; } static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = ACCESS_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.limit); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5332,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5838,9 +5965,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); - /* XXX: caller holds IRQ-safe mapping->tree_lock */ - VM_BUG_ON(!irqs_disabled()); - + /* Caller disabled preemption with mapping->tree_lock */ mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); } @@ -5861,7 +5986,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) id = swap_cgroup_record(entry, 
0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); + memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memsw, 1); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d487f8dc6d39..ea5a93659488 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -20,6 +20,14 @@ * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. + * + * It can be very tempting to add handling for obscure cases here. + * In general any code for handling new cases should only be added iff: + * - You know how to test it. + * - You have a test that can be added to mce-test + * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ + * - The case actually shows up as a frequent (top 10) page state in + * tools/vm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back @@ -28,13 +36,6 @@ * are rare we hope to get away with this. This avoids impacting the core * VM. */ - -/* - * Notebook: - * - hugetlb needs more code - * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages - * - pass bad pages to kdump next kernel - */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -56,6 +57,7 @@ #include <linux/mm_inline.h> #include <linux/kfifo.h> #include "internal.h" +#include "ras/ras_event.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -503,22 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill, kfree(tk); } -/* - * Error handlers for various types of pages. 
- */ - -enum outcome { - IGNORED, /* Error: cannot be handled */ - FAILED, /* Error: handling failed */ - DELAYED, /* Will be handled later */ - RECOVERED, /* Successfully recovered */ +static const char *action_name[] = { + [MF_IGNORED] = "Ignored", + [MF_FAILED] = "Failed", + [MF_DELAYED] = "Delayed", + [MF_RECOVERED] = "Recovered", }; -static const char *action_name[] = { - [IGNORED] = "Ignored", - [FAILED] = "Failed", - [DELAYED] = "Delayed", - [RECOVERED] = "Recovered", +static const char * const action_page_types[] = { + [MF_MSG_KERNEL] = "reserved kernel page", + [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", + [MF_MSG_SLAB] = "kernel slab page", + [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", + [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", + [MF_MSG_HUGE] = "huge page", + [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_UNMAP_FAILED] = "unmapping failed page", + [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", + [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", + [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", + [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", + [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", + [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", + [MF_MSG_DIRTY_LRU] = "dirty LRU page", + [MF_MSG_CLEAN_LRU] = "clean LRU page", + [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", + [MF_MSG_BUDDY] = "free buddy page", + [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", + [MF_MSG_UNKNOWN] = "unknown page", }; /* @@ -552,7 +566,7 @@ static int delete_from_lru_cache(struct page *p) */ static int me_kernel(struct page *p, unsigned long pfn) { - return IGNORED; + return MF_IGNORED; } /* @@ -561,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); - return FAILED; + return MF_FAILED; } /* @@ -570,7 +584,7 @@ static int 
me_unknown(struct page *p, unsigned long pfn) static int me_pagecache_clean(struct page *p, unsigned long pfn) { int err; - int ret = FAILED; + int ret = MF_FAILED; struct address_space *mapping; delete_from_lru_cache(p); @@ -580,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * should be the one m_f() holds. */ if (PageAnon(p)) - return RECOVERED; + return MF_RECOVERED; /* * Now truncate the page in the page cache. This is really @@ -594,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return FAILED; + return MF_FAILED; } /* @@ -611,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { - ret = RECOVERED; + ret = MF_RECOVERED; } } else { /* @@ -619,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * This fails on dirty or anything with private pages */ if (invalidate_inode_page(p)) - ret = RECOVERED; + ret = MF_RECOVERED; else printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", pfn); @@ -705,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) ClearPageUptodate(p); if (!delete_from_lru_cache(p)) - return DELAYED; + return MF_DELAYED; else - return FAILED; + return MF_FAILED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) @@ -715,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) delete_from_swap_cache(p); if (!delete_from_lru_cache(p)) - return RECOVERED; + return MF_RECOVERED; else - return FAILED; + return MF_FAILED; } /* @@ -730,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) { int res = 0; struct page *hpage = compound_head(p); + + if (!PageHuge(hpage)) + return MF_DELAYED; + /* * We can safely recover from error on free or reserved (i.e. * not in-use) hugepage by dequeuing it from freelist. 
@@ -743,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) if (!(page_mapping(hpage) || PageAnon(hpage))) { res = dequeue_hwpoisoned_huge_page(hpage); if (!res) - return RECOVERED; + return MF_RECOVERED; } - return DELAYED; + return MF_DELAYED; } /* @@ -777,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) static struct page_state { unsigned long mask; unsigned long res; - char *msg; + enum mf_action_page_type type; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, "reserved kernel", me_kernel }, + { reserved, reserved, MF_MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: * PG_buddy pages only make a small fraction of all free pages. @@ -791,31 +809,31 @@ static struct page_state { * currently unused objects without touching them. But just * treat it as standard kernel for now. */ - { slab, slab, "kernel slab", me_kernel }, + { slab, slab, MF_MSG_SLAB, me_kernel }, #ifdef CONFIG_PAGEFLAGS_EXTENDED - { head, head, "huge", me_huge_page }, - { tail, tail, "huge", me_huge_page }, + { head, head, MF_MSG_HUGE, me_huge_page }, + { tail, tail, MF_MSG_HUGE, me_huge_page }, #else - { compound, compound, "huge", me_huge_page }, + { compound, compound, MF_MSG_HUGE, me_huge_page }, #endif - { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, + { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, + { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, - { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, - { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, + { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, + { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, - { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, - { unevict|dirty, unevict, "clean unevictable LRU", 
me_pagecache_clean }, + { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, + { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, - { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, - { lru|dirty, lru, "clean LRU", me_pagecache_clean }, + { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty }, + { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean }, /* * Catchall entry: must be at end. */ - { 0, 0, "unknown page state", me_unknown }, + { 0, 0, MF_MSG_UNKNOWN, me_unknown }, }; #undef dirty @@ -835,10 +853,13 @@ static struct page_state { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). */ -static void action_result(unsigned long pfn, char *msg, int result) +static void action_result(unsigned long pfn, enum mf_action_page_type type, + enum mf_result result) { - pr_err("MCE %#lx: %s page recovery: %s\n", - pfn, msg, action_name[result]); + trace_memory_failure_event(pfn, type, result); + + pr_err("MCE %#lx: recovery action for %s: %s\n", + pfn, action_page_types[type], action_name[result]); } static int page_action(struct page_state *ps, struct page *p, @@ -850,23 +871,68 @@ static int page_action(struct page_state *ps, struct page *p, result = ps->action(p, pfn); count = page_count(p) - 1; - if (ps->action == me_swapcache_dirty && result == DELAYED) + if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { printk(KERN_ERR - "MCE %#lx: %s page still referenced by %d users\n", - pfn, ps->msg, count); - result = FAILED; + "MCE %#lx: %s still referenced by %d users\n", + pfn, action_page_types[ps->type], count); + result = MF_FAILED; } - action_result(pfn, ps->msg, result); + action_result(pfn, ps->type, result); /* Could do more checks here if page looks ok */ /* * Could adjust zone counters here to correct for the missing page. 
*/ - return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; + return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; +} + +/** + * get_hwpoison_page() - Get refcount for memory error handling: + * @page: raw error page (hit by memory error) + * + * Return: 0 if the refcount could not be grabbed, otherwise a non-zero + * value. + */ +int get_hwpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + + if (PageHuge(head)) + return get_page_unless_zero(head); + + /* + * Thp tail page has special refcounting rule (refcount of tail pages + * is stored in ->_mapcount,) so we can't call get_page_unless_zero() + * directly for tail pages. + */ + if (PageTransHuge(head)) { + /* + * Non anonymous thp exists only in allocation/free time. We + * can't handle such a case correctly, so let's give it up. + * This should be better than triggering BUG_ON when kernel + * tries to touch the "partially handled" page. + */ + if (!PageAnon(head)) { + pr_err("MCE: %#lx: non anonymous thp\n", + page_to_pfn(page)); + return 0; + } + + if (get_page_unless_zero(head)) { + if (PageTail(page)) + get_page(page); + return 1; + } else { + return 0; + } + } + + return get_page_unless_zero(page); } +EXPORT_SYMBOL_GPL(get_hwpoison_page); /* * Do all that is necessary to remove user space mappings. Unmap @@ -881,7 +947,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1, forcekill; struct page *hpage = *hpagep; - struct page *ppage; /* * Here we are interested only in user-mapped pages, so skip any @@ -931,59 +996,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* - * ppage: poisoned page - * if p is regular page(4k page) - * ppage == real poisoned page; - * else p is hugetlb or THP, ppage == head page. 
- */ - ppage = hpage; - - if (PageTransHuge(hpage)) { - /* - * Verify that this isn't a hugetlbfs head page, the check for - * PageAnon is just for avoid tripping a split_huge_page - * internal debug check, as split_huge_page refuses to deal with - * anything that isn't an anon page. PageAnon can't go away fro - * under us because we hold a refcount on the hpage, without a - * refcount on the hpage. split_huge_page can't be safely called - * in the first place, having a refcount on the tail isn't - * enough * to be safe. - */ - if (!PageHuge(hpage) && PageAnon(hpage)) { - if (unlikely(split_huge_page(hpage))) { - /* - * FIXME: if splitting THP is failed, it is - * better to stop the following operation rather - * than causing panic by unmapping. System might - * survive if the page is freed later. - */ - printk(KERN_INFO - "MCE %#lx: failed to split THP\n", pfn); - - BUG_ON(!PageHWPoison(p)); - return SWAP_FAIL; - } - /* - * We pinned the head page for hwpoison handling, - * now we split the thp and we are interested in - * the hwpoisoned raw page, so move the refcount - * to it. Similarly, page lock is shifted. - */ - if (hpage != p) { - if (!(flags & MF_COUNT_INCREASED)) { - put_page(hpage); - get_page(p); - } - lock_page(p); - unlock_page(hpage); - *hpagep = p; - } - /* THP is split, so ppage should be the real poisoned page. */ - ppage = p; - } - } - - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -992,12 +1004,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. 
*/ if (kill) - collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(ppage, ttu); + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(ppage)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1009,7 +1021,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); @@ -1055,6 +1067,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page_state *ps; struct page *p; struct page *hpage; + struct page *orig_head; int res; unsigned int nr_pages; unsigned long page_flags; @@ -1070,7 +1083,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - hpage = compound_head(p); + orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; @@ -1103,10 +1116,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 
*/ - if (!(flags & MF_COUNT_INCREASED) && - !get_page_unless_zero(hpage)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy", DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); return 0; } else if (PageHuge(hpage)) { /* @@ -1123,16 +1135,30 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, "free huge", - res ? IGNORED : DELAYED); + action_result(pfn, MF_MSG_FREE_HUGE, + res ? MF_IGNORED : MF_DELAYED); unlock_page(hpage); return res; } else { - action_result(pfn, "high order kernel", IGNORED); + action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; } } + if (!PageHuge(p) && PageTransHuge(hpage)) { + if (unlikely(split_huge_page(hpage))) { + pr_err("MCE: %#lx: thp split failed\n", pfn); + if (TestClearPageHWPoison(p)) + atomic_long_sub(nr_pages, &num_poisoned_pages); + put_page(p); + if (p != hpage) + put_page(hpage); + return -EBUSY; + } + VM_BUG_ON_PAGE(!page_count(p), p); + hpage = compound_head(p); + } + /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others @@ -1141,7 +1167,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. 
*/ - if (!PageHuge(p) && !PageTransTail(p)) { + if (!PageHuge(p)) { if (!PageLRU(p)) shake_page(p, 0); if (!PageLRU(p)) { @@ -1150,9 +1176,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (is_free_buddy_page(p)) { if (flags & MF_COUNT_INCREASED) - action_result(pfn, "free buddy", DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); else - action_result(pfn, "free buddy, 2nd try", DELAYED); + action_result(pfn, MF_MSG_BUDDY_2ND, + MF_DELAYED); return 0; } } @@ -1164,8 +1191,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * The page could have changed compound pages during the locking. * If this happens just bail out. */ - if (compound_head(p) != hpage) { - action_result(pfn, "different compound page after locking", IGNORED); + if (PageCompound(p) && compound_head(p) != orig_head) { + action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto out; } @@ -1185,9 +1212,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); put_page(hpage); - res = 0; - goto out; + return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) @@ -1205,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * on the head page to show that the hugepage is hwpoisoned */ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, "hugepage already hardware poisoned", - IGNORED); + action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); unlock_page(hpage); put_page(hpage); return 0; @@ -1235,7 +1261,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS) { - action_result(pfn, "unmapping failed", IGNORED); + action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; } @@ -1244,7 +1270,7 @@ int memory_failure(unsigned long 
pfn, int trapno, int flags) * Torn down by someone else? */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, "already truncated LRU", IGNORED); + action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; goto out; } @@ -1404,12 +1430,12 @@ int unpoison_memory(unsigned long pfn) */ if (!PageHuge(page) && PageTransHuge(page)) { pr_info("MCE: Memory failure is now running on %#lx\n", pfn); - return 0; + return 0; } nr_pages = 1 << compound_order(page); - if (!get_page_unless_zero(page)) { + if (!get_hwpoison_page(p)) { /* * Since HWPoisoned hugepage should have non-zero refcount, * race between memory failure and unpoison seems to happen. @@ -1477,7 +1503,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) * When the target page is a free hugepage, just remove it * from free hugepage list. */ - if (!get_page_unless_zero(compound_head(p))) { + if (!get_hwpoison_page(p)) { if (PageHuge(p)) { pr_info("%s: %#lx free huge page\n", __func__, pfn); ret = 0; @@ -1540,8 +1566,18 @@ static int soft_offline_huge_page(struct page *page, int flags) } unlock_page(hpage); - /* Keep page count to indicate a given hugepage is isolated. */ - list_move(&hpage->lru, &pagelist); + ret = isolate_huge_page(hpage, &pagelist); + if (ret) { + /* + * get_any_page() and isolate_huge_page() takes a refcount each, + * so need to drop one here. 
+ */ + put_page(hpage); + } else { + pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); + return -EBUSY; + } + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { @@ -1623,6 +1659,8 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); + if (!TestSetPageHWPoison(page)) + atomic_long_inc(&num_poisoned_pages); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { @@ -1637,22 +1675,8 @@ static int __soft_offline_page(struct page *page, int flags) pfn, ret, page->flags); if (ret > 0) ret = -EIO; - } else { - /* - * After page migration succeeds, the source page can - * be trapped in pagevec and actual freeing is delayed. - * Freeing code works differently based on PG_hwpoison, - * so there's a race. We need to make sure that the - * source page should be freed back to buddy before - * setting PG_hwpoison. - */ - if (!is_free_buddy_page(page)) - drain_all_pages(page_zone(page)); - SetPageHWPoison(page); - if (!is_free_buddy_page(page)) - pr_info("soft offline: %#lx: page leaked\n", - pfn); - atomic_long_inc(&num_poisoned_pages); + if (TestClearPageHWPoison(page)) + atomic_long_dec(&num_poisoned_pages); } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", @@ -1703,14 +1727,6 @@ int soft_offline_page(struct page *page, int flags) get_online_mems(); - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. This flag should be kept set until the source page - * is freed and PG_hwpoison on it is set. 
- */ - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - set_migratetype_isolate(page, true); - ret = get_any_page(page, pfn, flags); put_online_mems(); if (ret > 0) { /* for in-use pages */ @@ -1721,14 +1737,13 @@ int soft_offline_page(struct page *page, int flags) } else if (ret == 0) { /* for free pages */ if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_order(hpage), + if (!dequeue_hwpoisoned_huge_page(hpage)) + atomic_long_add(1 << compound_order(hpage), &num_poisoned_pages); } else { - SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); + if (!TestSetPageHWPoison(page)) + atomic_long_inc(&num_poisoned_pages); } } - unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 97839f5c8c30..388dcf9aa283 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ - if (vma->vm_ops) - printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", - vma->vm_ops->fault); - if (vma->vm_file) - printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", - vma->vm_file->f_op->mmap); + pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", + vma->vm_file, + vma->vm_ops ? vma->vm_ops->fault : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + mapping ? mapping->a_ops->readpage : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } @@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* - * This routine handles present pages, when users try to write - * to a shared page. It is done by copying the page to a new address - * and decrementing the shared-page counter for the old page. 
- * - * Note that this routine assumes that the protection checks have been - * done by the caller (the low-level page fault routine in most cases). - * Thus we can safely just mark it writable once we've done any necessary - * COW. + * Handle write page faults for pages that can be reused in the current vma * - * We also mark the page dirty at this point even though the page will - * change only once the write actually happens. This avoids a few races, - * and potentially makes it more efficient. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * This can happen either due to the mapping being with the VM_SHARED flag, + * or due to us being the last reference standing to the page. In either + * case, all we need to do here is to mark the page as writable and update + * any related book-keeping. */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - spinlock_t *ptl, pte_t orig_pte) +static inline int wp_page_reuse(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + struct page *page, int page_mkwrite, + int dirty_shared) __releases(ptl) { - struct page *old_page, *new_page = NULL; pte_t entry; - int ret = 0; - int page_mkwrite = 0; - bool dirty_shared = false; - unsigned long mmun_start = 0; /* For mmu_notifiers */ - unsigned long mmun_end = 0; /* For mmu_notifiers */ - struct mem_cgroup *memcg; - - old_page = vm_normal_page(vma, address, orig_pte); - if (!old_page) { - /* - * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a - * VM_PFNMAP VMA. - * - * We should not cow pages in a shared writeable mapping. - * Just mark the pages writable as we can't do any dirty - * accounting on raw pfn maps. 
- */ - if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED)) - goto reuse; - goto gotten; - } - /* - * Take out anonymous pages first, anonymous shared vmas are - * not dirty accountable. + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { - if (!trylock_page(old_page)) { - page_cache_get(old_page); - pte_unmap_unlock(page_table, ptl); - lock_page(old_page); - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - goto unlock; - } - page_cache_release(old_page); - } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); - unlock_page(old_page); - goto reuse; - } - unlock_page(old_page); - } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED))) { - page_cache_get(old_page); - /* - * Only catch write-faults on shared writable pages, - * read-only shared pages can get COWed by - * get_user_pages(.write=1, .force=1). - */ - if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - int tmp; - - pte_unmap_unlock(page_table, ptl); - tmp = do_page_mkwrite(vma, old_page, address); - if (unlikely(!tmp || (tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - page_cache_release(old_page); - return tmp; - } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. 
- */ - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - goto unlock; - } - page_mkwrite = 1; - } - - dirty_shared = true; - -reuse: - /* - * Clear the pages cpupid information as the existing - * information potentially belongs to a now completely - * unrelated process. - */ - if (old_page) - page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); - - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, page_table); - pte_unmap_unlock(page_table, ptl); - ret |= VM_FAULT_WRITE; + if (page) + page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - if (dirty_shared) { - struct address_space *mapping; - int dirtied; + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, address, page_table, entry, 1)) + update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); - if (!page_mkwrite) - lock_page(old_page); + if (dirty_shared) { + struct address_space *mapping; + int dirtied; - dirtied = set_page_dirty(old_page); - VM_BUG_ON_PAGE(PageAnon(old_page), old_page); - mapping = old_page->mapping; - unlock_page(old_page); - page_cache_release(old_page); + if (!page_mkwrite) + lock_page(page); - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); - if (!page_mkwrite) - file_update_time(vma->vm_file); + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but 
still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); } - return ret; + if (!page_mkwrite) + file_update_time(vma->vm_file); } - /* - * Ok, we need to copy. Oh, well.. - */ - page_cache_get(old_page); -gotten: - pte_unmap_unlock(page_table, ptl); + return VM_FAULT_WRITE; +} + +/* + * Handle the case of a page which we actually need to copy to a new page. + * + * Called with mmap_sem locked and the old page referenced, but + * without the ptl held. + * + * High level logic flow: + * + * - Allocate a page, copy the content of the old page to the new one. + * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. + * - Take the PTL. If the pte changed, bail out and release the allocated page + * - If the pte is still the way we remember it, update the page table and all + * relevant references. This includes dropping the reference the page-table + * held to the old page, as well as updating the rmap. + * - In any case, unlock the PTL and drop the reference we took to the old page. 
+ */ +static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + pte_t orig_pte, struct page *old_page) +{ + struct page *new_page = NULL; + spinlock_t *ptl = NULL; + pte_t entry; + int page_copied = 0; + const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ + const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ + struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2158,13 +2081,12 @@ gotten: goto oom; cow_user_page(new_page, old_page, address, vma); } - __SetPageUptodate(new_page); if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; - mmun_start = address & PAGE_MASK; - mmun_end = mmun_start + PAGE_SIZE; + __SetPageUptodate(new_page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* @@ -2177,8 +2099,9 @@ gotten: dec_mm_counter_fast(mm, MM_FILEPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES); } - } else + } else { inc_mm_counter_fast(mm, MM_ANONPAGES); + } flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2227,29 +2150,29 @@ gotten: /* Free the old page.. */ new_page = old_page; - ret |= VM_FAULT_WRITE; - } else + page_copied = 1; + } else { mem_cgroup_cancel_charge(new_page, memcg); + } if (new_page) page_cache_release(new_page); -unlock: + pte_unmap_unlock(page_table, ptl); - if (mmun_end > mmun_start) - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. 
*/ - if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); } - return ret; + return page_copied ? VM_FAULT_WRITE : 0; oom_free_new: page_cache_release(new_page); oom: @@ -2258,6 +2181,179 @@ oom: return VM_FAULT_OOM; } +/* + * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED + * mapping + */ +static int wp_pfn_shared(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + pmd_t *pmd) +{ + if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { + struct vm_fault vmf = { + .page = NULL, + .pgoff = linear_page_index(vma, address), + .virtual_address = (void __user *)(address & PAGE_MASK), + .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, + }; + int ret; + + pte_unmap_unlock(page_table, ptl); + ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); + if (ret & VM_FAULT_ERROR) + return ret; + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* + * We might have raced with another page fault while we + * released the pte_offset_map_lock. + */ + if (!pte_same(*page_table, orig_pte)) { + pte_unmap_unlock(page_table, ptl); + return 0; + } + } + return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, + NULL, 0, 0); +} + +static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, + pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, + struct page *old_page) + __releases(ptl) +{ + int page_mkwrite = 0; + + page_cache_get(old_page); + + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). 
+ */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + int tmp; + + pte_unmap_unlock(page_table, ptl); + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; + } + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; + } + page_mkwrite = 1; + } + + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, page_mkwrite, 1); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
+ */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) +{ + struct page *old_page; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) { + /* + * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a + * VM_PFNMAP VMA. + * + * We should not cow pages in a shared writeable mapping. + * Just mark the pages writable and/or call ops->pfn_mkwrite. + */ + if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED)) + return wp_pfn_shared(mm, vma, address, page_table, ptl, + orig_pte, pmd); + + pte_unmap_unlock(page_table, ptl); + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); + } + + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page) && !PageKsm(old_page)) { + if (!trylock_page(old_page)) { + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + lock_page(old_page); + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; + } + page_cache_release(old_page); + } + if (reuse_swap_page(old_page)) { + /* + * The page is all ours. Move it to our anon_vma so + * the rmap code will not search our parent or siblings. + * Protected against the rmap code by the page lock. + */ + page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); + } + unlock_page(old_page); + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + return wp_page_shared(mm, vma, address, page_table, pmd, + ptl, orig_pte, old_page); + } + + /* + * Ok, we need to copy. Oh, well.. 
+ */ + page_cache_get(old_page); + + pte_unmap_unlock(page_table, ptl); + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); +} + static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) @@ -2574,6 +2670,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap(page_table); + /* File mapping without ->vm_ops ? */ + if (vma->vm_flags & VM_SHARED) + return VM_FAULT_SIGBUS; + /* Check if we need to add a guard page to the stack */ if (check_stack_guard_page(vma, address) < 0) return VM_FAULT_SIGSEGV; @@ -2594,6 +2694,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + goto oom_free_page; + /* * The memory barrier inside __SetPageUptodate makes sure that * preceeding stores to the page contents become visible before @@ -2601,9 +2705,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) - goto oom_free_page; - entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -2784,7 +2885,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, struct vm_fault vmf; int off; - nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; + nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; start_addr = max(address & mask, vma->vm_start); @@ -3002,6 +3103,9 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); + /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ + if (!vma->vm_ops->fault) + return 
VM_FAULT_SIGBUS; if (!(flags & FAULT_FLAG_WRITE)) return do_read_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); @@ -3147,13 +3251,12 @@ static int handle_pte_fault(struct mm_struct *mm, barrier(); if (!pte_present(entry)) { if (pte_none(entry)) { - if (vma->vm_ops) { - if (likely(vma->vm_ops->fault)) - return do_fault(mm, vma, address, pte, - pmd, flags, entry); - } - return do_anonymous_page(mm, vma, address, - pte, pmd, flags); + if (vma->vm_ops) + return do_fault(mm, vma, address, pte, pmd, + flags, entry); + + return do_anonymous_page(mm, vma, address, pte, pmd, + flags); } return do_swap_page(mm, vma, address, pte, pmd, flags, entry); @@ -3629,7 +3732,7 @@ void print_vma_addr(char *prefix, unsigned long ip) if (buf) { char *p; - p = d_path(&f->f_path, buf, PAGE_SIZE); + p = file_path(f, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; printk("%s%s[%lx+%lx]", prefix, kbasename(p), @@ -3642,7 +3745,7 @@ void print_vma_addr(char *prefix, unsigned long ip) } #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) -void might_fault(void) +void __might_fault(const char *file, int line) { /* * Some code (nfs/sunrpc) uses socket ops on kernel memory while @@ -3652,21 +3755,15 @@ void might_fault(void) */ if (segment_eq(get_fs(), KERNEL_DS)) return; - - /* - * it would be nicer only to annotate paths which are not under - * pagefault_disable, however that requires a larger audit and - * providing helpers like get_user_atomic. 
- */ - if (in_atomic()) + if (pagefault_disabled()) return; - - __might_sleep(__FILE__, __LINE__, 0); - + __might_sleep(file, line, 0); +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(&current->mm->mmap_sem); +#endif } -EXPORT_SYMBOL(might_fault); +EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 65842d688b7c..003dbe4b060d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -104,7 +104,7 @@ void put_online_mems(void) } -static void mem_hotplug_begin(void) +void mem_hotplug_begin(void) { mem_hotplug.active_writer = current; @@ -119,7 +119,7 @@ static void mem_hotplug_begin(void) } } -static void mem_hotplug_done(void) +void mem_hotplug_done(void) { mem_hotplug.active_writer = NULL; mutex_unlock(&mem_hotplug.lock); @@ -446,7 +446,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int nid = pgdat->node_id; int zone_type; - unsigned long flags; + unsigned long flags, pfn; int ret; zone_type = zone - pgdat->node_zones; @@ -461,6 +461,14 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn, MEMMAP_HOTPLUG); + + /* online_page_range is called later and expects pages reserved */ + for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { + if (!pfn_valid(pfn)) + continue; + + SetPageReserved(pfn_to_page(pfn)); + } return 0; } @@ -502,7 +510,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); + err = __add_section(nid, zone, section_nr_to_pfn(i)); /* * EEXIST is finally dealt with by ioresource collision @@ -513,6 +521,7 @@ int 
__ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, break; err = 0; } + vmemmap_populate_print_last(); return err; } @@ -959,6 +968,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) } +/* Must be protected by mem_hotplug_begin() */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -969,7 +979,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the @@ -977,21 +986,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); - ret = -EINVAL; if ((zone_idx(zone) > ZONE_NORMAL || online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) - goto out; + return -EINVAL; if (online_type == MMOP_ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } if (online_type == MMOP_ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } /* Previous code may changed the zone of the pfn range */ @@ -1007,7 +1015,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } /* * If this zone is not populated, then it is not in zonelist. 
@@ -1031,7 +1039,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } zone->present_pages += onlined_pages; @@ -1061,9 +1069,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); -out: - mem_hotplug_done(); - return ret; + return 0; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1376,7 +1382,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) if (PageLRU(page)) return pfn; if (PageHuge(page)) { - if (is_hugepage_active(page)) + if (page_huge_active(page)) return pfn; else pfn = round_up(pfn + 1, @@ -1688,21 +1694,18 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - mem_hotplug_begin(); - zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - ret = -EINVAL; if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - goto out; + return -EINVAL; /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); if (ret) - goto out; + return ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1795,7 +1798,6 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - mem_hotplug_done(); return 0; failed_removal: @@ -1805,12 +1807,10 @@ failed_removal: memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); - -out: - mem_hotplug_done(); return ret; } +/* Must be protected by mem_hotplug_begin() */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); @@ -1978,8 +1978,10 @@ void try_offline_node(int nid) * wait_table may be allocated from boot 
memory, * here only free if it's allocated by vmalloc. */ - if (is_vmalloc_addr(zone->wait_table)) + if (is_vmalloc_addr(zone->wait_table)) { vfree(zone->wait_table); + zone->wait_table = NULL; + } } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4721046a134a..99d4c1d0b858 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x return alloc_huge_page_node(page_hstate(compound_head(page)), node); else - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE, 0); } /* @@ -1971,34 +1972,41 @@ retry_cpuset: pol = get_vma_policy(vma, addr); cpuset_mems_cookie = read_mems_allowed_begin(); - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && - pol->mode != MPOL_INTERLEAVE)) { + if (pol->mode == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + mpol_cond_put(pol); + page = alloc_page_interleave(gfp, order, nid); + goto out; + } + + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { + int hpage_node = node; + /* * For hugepage allocation and non-interleave policy which - * allows the current node, we only try to allocate from the - * current node and don't fall back to other nodes, as the - * cost of remote accesses would likely offset THP benefits. + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. * * If the policy is interleave, or does not allow the current * node in its nodemask, we allocate the standard way. 
*/ + if (pol->mode == MPOL_PREFERRED && + !(pol->flags & MPOL_F_LOCAL)) + hpage_node = pol->v.preferred_node; + nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(node, *nmask)) { + if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); - page = alloc_pages_exact_node(node, gfp, order); + page = alloc_pages_exact_node(hpage_node, + gfp | __GFP_THISNODE, order); goto out; } } - if (pol->mode == MPOL_INTERLEAVE) { - unsigned nid; - - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - page = alloc_page_interleave(gfp, order, nid); - goto out; - } - nmask = policy_nodemask(gfp, pol); zl = policy_zonelist(gfp, pol, node); mpol_cond_put(pol); @@ -2516,7 +2524,7 @@ static void __init check_numabalancing_enable(void) if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); - if (nr_node_ids > 1 && !numabalancing_override) { + if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. " "Configure with numa_balancing= or the " "kernel.numa_balancing sysctl", diff --git a/mm/mempool.c b/mm/mempool.c index e209c98c7203..2cc08de8b1db 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -6,26 +6,138 @@ * extreme VM load. 
* * started by Ingo Molnar, Copyright (C) 2001 + * debugging by David Rientjes, Copyright (C) 2015 */ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/blkdev.h> #include <linux/writeback.h> +#include "slab.h" + +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +static void poison_error(mempool_t *pool, void *element, size_t size, + size_t byte) +{ + const int nr = pool->curr_nr; + const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); + const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); + int i; + + pr_err("BUG: mempool element poison mismatch\n"); + pr_err("Mempool %p size %zu\n", pool, size); + pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); + for (i = start; i < end; i++) + pr_cont("%x ", *(u8 *)(element + i)); + pr_cont("%s\n", end < size ? "..." : ""); + dump_stack(); +} + +static void __check_element(mempool_t *pool, void *element, size_t size) +{ + u8 *obj = element; + size_t i; + + for (i = 0; i < size; i++) { + u8 exp = (i < size - 1) ? 
POISON_FREE : POISON_END; + + if (obj[i] != exp) { + poison_error(pool, element, size, i); + return; + } + } + memset(obj, POISON_INUSE, size); +} + +static void check_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->free == mempool_free_slab || pool->free == mempool_kfree) + __check_element(pool, element, ksize(element)); + + /* Mempools backed by page allocator */ + if (pool->free == mempool_free_pages) { + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} + +static void __poison_element(void *element, size_t size) +{ + u8 *obj = element; + + memset(obj, POISON_FREE, size - 1); + obj[size - 1] = POISON_END; +} + +static void poison_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + __poison_element(element, ksize(element)); + + /* Mempools backed by page allocator */ + if (pool->alloc == mempool_alloc_pages) { + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __poison_element(addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} +#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +static inline void check_element(mempool_t *pool, void *element) +{ +} +static inline void poison_element(mempool_t *pool, void *element) +{ +} +#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ + +static void kasan_poison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + kasan_slab_free(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_kfree(element); + if (pool->alloc == mempool_alloc_pages) + kasan_free_pages(element, (unsigned long)pool->pool_data); +} + +static void kasan_unpoison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + 
kasan_slab_alloc(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_krealloc(element, (size_t)pool->pool_data); + if (pool->alloc == mempool_alloc_pages) + kasan_alloc_pages(element, (unsigned long)pool->pool_data); +} static void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); + poison_element(pool, element); + kasan_poison_element(pool, element); pool->elements[pool->curr_nr++] = element; } static void *remove_element(mempool_t *pool) { - BUG_ON(pool->curr_nr <= 0); - return pool->elements[--pool->curr_nr]; + void *element = pool->elements[--pool->curr_nr]; + + BUG_ON(pool->curr_nr < 0); + check_element(pool, element); + kasan_unpoison_element(pool, element); + return element; } /** @@ -113,23 +225,24 @@ EXPORT_SYMBOL(mempool_create_node); * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. - * @gfp_mask: the usual allocation bitmask. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. + * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. 
*/ -int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) +int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); + might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -145,7 +258,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ - new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), + GFP_KERNEL); if (!new_elements) return -ENOMEM; @@ -164,7 +278,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); - element = pool->alloc(gfp_mask, pool->pool_data); + element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); @@ -332,6 +446,7 @@ EXPORT_SYMBOL(mempool_free); void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; + VM_BUG_ON(mem->ctor); return kmem_cache_alloc(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); diff --git a/mm/memtest.c b/mm/memtest.c new file mode 100644 index 000000000000..0a1cc133f6d7 --- /dev/null +++ b/mm/memtest.c @@ -0,0 +1,119 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/memblock.h> + +static u64 patterns[] __initdata = { + /* The first entry has to be 0 to leave memtest with zeroed memory */ + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 
0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ +}; + +static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) +{ + printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", + (unsigned long long) pattern, + (unsigned long long) start_bad, + (unsigned long long) end_bad); + memblock_reserve(start_bad, end_bad - start_bad); +} + +static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size) +{ + u64 *p, *start, *end; + phys_addr_t start_bad, last_bad; + phys_addr_t start_phys_aligned; + const size_t incr = sizeof(pattern); + + start_phys_aligned = ALIGN(start_phys, incr); + start = __va(start_phys_aligned); + end = start + (size - (start_phys_aligned - start_phys)) / incr; + start_bad = 0; + last_bad = 0; + + for (p = start; p < end; p++) + *p = pattern; + + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); +} + +static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) +{ + u64 i; + phys_addr_t this_start, this_end; + + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start, + &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + if (this_start < this_end) { + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long)this_start, + (unsigned long long)this_end, + (unsigned long long)cpu_to_be64(pattern)); + memtest(pattern, this_start, this_end - this_start); + } + } +} + +/* default is disabled */ +static int memtest_pattern __initdata; + +static int __init parse_memtest(char *arg) +{ 
+ if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + else + memtest_pattern = ARRAY_SIZE(patterns); + + return 0; +} + +early_param("memtest", parse_memtest); + +void __init early_memtest(phys_addr_t start, phys_addr_t end) +{ + unsigned int i; + unsigned int idx = 0; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); + for (i = memtest_pattern-1; i < UINT_MAX; --i) { + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } +} diff --git a/mm/migrate.c b/mm/migrate.c index 85e042686031..eb4267107d1f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) * Please do not reorder this without considering how mm/ksm.c's * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). */ - ClearPageSwapCache(page); + if (PageSwapCache(page)) + ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -879,7 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes or remove ptes */ if (page_mapped(page)) { try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| + TTU_IGNORE_HWPOISON); page_was_mapped = 1; } @@ -901,12 +903,24 @@ out: } /* + * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work + * around it. + */ +#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM) +#define ICE_noinline noinline +#else +#define ICE_noinline +#endif + +/* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. 
*/ -static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct page *page, int force, - enum migrate_mode mode) +static ICE_noinline int unmap_and_move(new_page_t get_new_page, + free_page_t put_new_page, + unsigned long private, struct page *page, + int force, enum migrate_mode mode, + enum migrate_reason reason) { int rc = 0; int *result = NULL; @@ -937,7 +951,11 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - putback_lru_page(page); + /* Soft-offlined page shouldn't go through lru cache list */ + if (reason == MR_MEMORY_FAILURE) + put_page(page); + else + putback_lru_page(page); } /* @@ -1110,7 +1128,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, pass > 2, mode); else rc = unmap_and_move(get_new_page, put_new_page, - private, page, pass > 2, mode); + private, page, pass > 2, mode, + reason); switch(rc) { case -ENOMEM: @@ -1554,30 +1573,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page, * page migration rate limiting control. * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs * window of time. Default here says do not migrate more than 1280M per second. - * If a node is rate-limited then PTE NUMA updates are also rate-limited. However - * as it is faults that reset the window, pte updates will happen unconditionally - * if there has not been a fault since @pteupdate_interval_millisecs after the - * throttle window closed. 
*/ static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); -/* Returns true if NUMA migration is currently rate limited */ -bool migrate_ratelimited(int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + - msecs_to_jiffies(pteupdate_interval_millisecs))) - return false; - - if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) - return false; - - return true; -} - /* Returns true if the node is migrate rate-limited after the update */ static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) @@ -1804,7 +1803,7 @@ fail_putback: */ flush_cache_range(vma, mmun_start, mmun_end); page_add_anon_rmap(new_page, vma, mmun_start); - pmdp_clear_flush_notify(vma, mmun_start, pmd); + pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); diff --git a/mm/mlock.c b/mm/mlock.c index 8a54cd214925..6fd2cf15e868 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -205,62 +205,6 @@ out: return nr_pages - 1; } -/** - * __mlock_vma_pages_range() - mlock a range of pages in the vma. - * @vma: target vma - * @start: start address - * @end: end address - * @nonblocking: - * - * This takes care of making the pages present too. - * - * return 0 on success, negative error code on error. - * - * vma->vm_mm->mmap_sem must be held. - * - * If @nonblocking is NULL, it may be held for read or write and will - * be unperturbed. - * - * If @nonblocking is non-NULL, it must held for read only and may be - * released. If it's released, *@nonblocking will be set to 0. 
- */ -long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long nr_pages = (end - start) / PAGE_SIZE; - int gup_flags; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - - gup_flags = FOLL_TOUCH | FOLL_MLOCK; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) - gup_flags |= FOLL_WRITE; - - /* - * We want mlock to succeed for regions that have any permissions - * other than PROT_NONE. - */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) - gup_flags |= FOLL_FORCE; - - /* - * We made sure addr is within a VMA, so the following will - * not result in a stack expansion that recurses back here. - */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); -} - /* * convert get_user_pages() return value to posix mlock() error */ @@ -596,7 +540,7 @@ success: /* * vm_flags is protected by the mmap_sem held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, __mlock_vma_pages_range will bring it back. + * set VM_LOCKED, populate_vma_page_range will bring it back. */ if (lock) @@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -/* - * __mm_populate - populate and/or mlock pages within a range of address space. - * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. 
- */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) -{ - struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; - struct vm_area_struct *vma = NULL; - int locked = 0; - long ret = 0; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(len != PAGE_ALIGN(len)); - end = start + len; - - for (nstart = start; nstart < end; nstart = nend) { - /* - * We want to fault in pages for [nstart; end) address range. - * Find first corresponding VMA. - */ - if (!locked) { - locked = 1; - down_read(&mm->mmap_sem); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - /* - * Set [nstart; nend) to intersection of desired address - * range with the first VMA. Also, skip undesirable VMA types. - */ - nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - /* - * Now fault in a range of pages. __mlock_vma_pages_range() - * double checks the vma flags, so that it won't mlock pages - * if the vma was already munlocked. 
- */ - ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); - if (ret < 0) { - if (ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - ret = __mlock_posix_error_return(ret); - break; - } - nend = nstart + ret * PAGE_SIZE; - ret = 0; - } - if (locked) - up_read(&mm->mmap_sem); - return ret; /* 0 or negative error code */ -} - SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) error = do_mlock(start, len, 1); up_write(&current->mm->mmap_sem); - if (!error) - error = __mm_populate(start, len, 0); - return error; + if (error) + return error; + + error = __mm_populate(start, len, 0); + if (error) + return __mlock_posix_error_return(error); + return 0; } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) diff --git a/mm/mm_init.c b/mm/mm_init.c index 5f420f7fafa1..fdadf918de76 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -11,6 +11,7 @@ #include <linux/export.h> #include <linux/memory.h> #include <linux/notifier.h> +#include <linux/sched.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -130,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void) BUG_ON(or_mask != add_mask); } -void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, - unsigned long nid, unsigned long pfn) -{ - BUG_ON(page_to_nid(page) != nid); - BUG_ON(page_zonenum(page) != zone); - BUG_ON(page_to_pfn(page) != pfn); -} - static __init int set_mminit_loglevel(char *str) { get_option(&str, &mminit_loglevel); diff --git a/mm/mmap.c b/mm/mmap.c index 9ec50a368634..aa632ade2be7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. 
But - * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. + * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid @@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { - struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; @@ -1258,6 +1258,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, *populate = 0; + if (!len) + return -EINVAL; + /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -1268,9 +1271,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) prot |= PROT_EXEC; - if (!len) - return -EINVAL; - if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ error = -ENOMEM; -munmap_back: - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { + while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, + &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; - goto munmap_back; } /* @@ -1571,7 +1570,8 @@ munmap_back: /* * Can we just expand an old mapping? 
*/ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, + NULL); if (vma) goto out; @@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns actual_size = size; if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) actual_size -= PAGE_SIZE; - if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long locked; unsigned long limit; locked = mm->locked_vm + grow; - limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); + limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) return -ENOMEM; @@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) - __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); + populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else @@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (expand_stack(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) - __mlock_vma_pages_range(vma, addr, start, NULL); + populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif @@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* * Clear old maps. this also does some error checking for us */ - munmap_back: - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { + while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, + &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; - goto munmap_back; } /* Check against address space limits *after* clearing old maps... 
*/ diff --git a/mm/mprotect.c b/mm/mprotect.c index 88584838e704..e7d6f1171ecb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,6 +29,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + /* * For a prot_numa update we only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -322,6 +324,15 @@ success: change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + /* + * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major + * fault on access. + */ + if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && + (newflags & VM_WRITE)) { + populate_vma_page_range(vma, start, end, NULL); + } + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); diff --git a/mm/mremap.c b/mm/mremap.c index 57dadc025c64..a7c93eceb1c8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include <linux/mmu_notifier.h> #include <linux/sched/sysctl.h> #include <linux/uaccess.h> +#include <linux/mm-arch-hooks.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -286,8 +287,18 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) - vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + } else { + if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + if (err < 0) { + move_page_tables(new_vma, new_addr, vma, + old_addr, moved_len, true); + return err; + } + } + arch_remap(mm, old_addr, old_addr + old_len, + new_addr, new_addr + new_len); + } /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT) { @@ -339,25 +350,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, struct vm_area_struct *vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) - 
goto Efault; + return ERR_PTR(-EFAULT); if (is_vm_hugetlb_page(vma)) - goto Einval; + return ERR_PTR(-EINVAL); /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) - goto Efault; + return ERR_PTR(-EFAULT); /* Need to be careful about a growing mapping */ if (new_len > old_len) { unsigned long pgoff; if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - goto Efault; + return ERR_PTR(-EFAULT); pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - goto Einval; + return ERR_PTR(-EINVAL); } if (vma->vm_flags & VM_LOCKED) { @@ -366,29 +377,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK); locked += new_len - old_len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - goto Eagain; + return ERR_PTR(-EAGAIN); } if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) - goto Enomem; + return ERR_PTR(-ENOMEM); if (vma->vm_flags & VM_ACCOUNT) { unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; if (security_vm_enough_memory_mm(mm, charged)) - goto Efault; + return ERR_PTR(-ENOMEM); *p = charged; } return vma; - -Efault: /* very odd choice for most of the cases, but... 
*/ - return ERR_PTR(-EFAULT); -Einval: - return ERR_PTR(-EINVAL); -Enomem: - return ERR_PTR(-ENOMEM); -Eagain: - return ERR_PTR(-EAGAIN); } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 90b50468333e..e57cf24babd6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, { void *ptr; u64 addr; + ulong flags = choose_memblock_flags(); if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(size, align, goal, limit, nid); +again: + addr = memblock_find_in_range_node(size, align, goal, limit, nid, + flags); + if (!addr && (flags & MEMBLOCK_MIRROR)) { + flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + goto again; + } if (!addr) return NULL; @@ -77,7 +86,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -92,7 +101,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) while (start + (1UL << order) > end) order--; - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); start += (1UL << order); } @@ -121,7 +130,11 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); - for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) count += __free_memory_core(start, end); #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK diff --git a/mm/nommu.c b/mm/nommu.c index 3fba2dc97c44..58ea3643b9e9 100644 --- a/mm/nommu.c +++ 
b/mm/nommu.c @@ -42,22 +42,6 @@ #include <asm/mmu_context.h> #include "internal.h" -#if 0 -#define kenter(FMT, ...) \ - printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) \ - printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) \ - printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) -#else -#define kenter(FMT, ...) \ - no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) \ - no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) \ - no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) -#endif - void *high_memory; EXPORT_SYMBOL(high_memory); struct page *mem_map; @@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to) for (; from < to; from += PAGE_SIZE) { struct page *page = virt_to_page(from); - kdebug("- free %lx", from); atomic_long_dec(&mmap_pages_allocated); - if (page_count(page) != 1) - kdebug("free page %p: refcount not one: %d", - page, page_count(page)); put_page(page); } } @@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, region->vm_usage); - BUG_ON(!nommu_region_tree.rb_node); if (--region->vm_usage == 0) { @@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region) /* IO memory and memory shared directly out of the pagecache * from ramfs/tmpfs mustn't be released here */ - if (region->vm_flags & VM_MAPPED_COPY) { - kdebug("free series"); + if (region->vm_flags & VM_MAPPED_COPY) free_page_series(region->vm_start, region->vm_top); - } kmem_cache_free(vm_region_jar, region); } else { up_write(&nommu_region_sem); @@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) struct address_space *mapping; struct rb_node **p, *parent, *rb_prev; - kenter(",%p", vma); - BUG_ON(!vma->vm_region); 
mm->map_count++; @@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; - kenter("%p", vma); - protect_vma(vma, 0); mm->map_count--; @@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) */ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED) { - printk(KERN_DEBUG - "%d: Can't do fixed-address/overlay mmap of RAM\n", - current->pid); + if (flags & MAP_FIXED) return -EINVAL; - } if ((flags & MAP_TYPE) != MAP_PRIVATE && (flags & MAP_TYPE) != MAP_SHARED) @@ -1016,7 +983,7 @@ static int validate_mmap_request(struct file *file, * device */ if (!file->f_op->get_unmapped_area) capabilities &= ~NOMMU_MAP_DIRECT; - if (!file->f_op->read) + if (!(file->f_mode & FMODE_CAN_READ)) capabilities &= ~NOMMU_MAP_COPY; /* The file shall have been opened with read permission. 
*/ @@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file, ) { capabilities &= ~NOMMU_MAP_DIRECT; if (flags & MAP_SHARED) { - printk(KERN_WARNING - "MAP_SHARED not completely supported on !MMU\n"); + pr_warn("MAP_SHARED not completely supported on !MMU\n"); return -EINVAL; } } @@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma, * we're allocating is smaller than a page */ order = get_order(len); - kdebug("alloc order %d for %lx", order, len); - total = 1 << order; point = len >> PAGE_SHIFT; /* we don't want to allocate a power-of-2 sized page set */ - if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) total = point; - kdebug("try to alloc exact %lu pages", total); - } base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); if (!base) @@ -1240,7 +1202,7 @@ static int do_mmap_private(struct vm_area_struct *vma, old_fs = get_fs(); set_fs(KERNEL_DS); - ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + ret = __vfs_read(vma->vm_file, base, len, &fpos); set_fs(old_fs); if (ret < 0) @@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long capabilities, vm_flags, result; int ret; - kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - *populate = 0; /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) { - kleave(" = %d [val]", ret); + if (ret < 0) return ret; - } /* we ignore the address hint */ addr = 0; @@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_start = start; vma->vm_end = start + len; - if (pregion->vm_flags & VM_MAPPED_COPY) { - kdebug("share copy"); + if (pregion->vm_flags & VM_MAPPED_COPY) vma->vm_flags |= VM_MAPPED_COPY; - } else { - kdebug("share mmap"); + else { ret = do_mmap_shared_file(vma); if (ret < 0) { 
vma->vm_region = NULL; @@ -1467,7 +1423,6 @@ share: up_write(&nommu_region_sem); - kleave(" = %lx", result); return result; error_just_free: @@ -1479,27 +1434,24 @@ error: if (vma->vm_file) fput(vma->vm_file); kmem_cache_free(vm_area_cachep, vma); - kleave(" = %d", ret); return ret; sharing_violation: up_write(&nommu_region_sem); - printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; error_getting_vma: kmem_cache_free(vm_region_jar, region); - printk(KERN_WARNING "Allocation of vma for %lu byte allocation" - " from process %d failed\n", - len, current->pid); + pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", + len, current->pid); show_free_areas(0); return -ENOMEM; error_getting_region: - printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" - " from process %d failed\n", - len, current->pid); + pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", + len, current->pid); show_free_areas(0); return -ENOMEM; } @@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_region *region; unsigned long npages; - kenter(""); - /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) @@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm, { struct vm_region *region; - kenter(""); - /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ delete_vma_from_mm(vma); @@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unsigned long end; int ret; - kenter(",%lx,%zx", start, len); - len = PAGE_ALIGN(len); if (len == 0) return -EINVAL; @@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (!vma) { static int limit; if (limit < 5) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d" 
- " (%s): 0x%lx-0x%lx\n", - current->pid, current->comm, - start, start + len - 1); + pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); limit++; } return -EINVAL; @@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* we're allowed to split an anonymous VMA but not a file-backed one */ if (vma->vm_file) { do { - if (start > vma->vm_start) { - kleave(" = -EINVAL [miss]"); + if (start > vma->vm_start) return -EINVAL; - } if (end == vma->vm_end) goto erase_whole_vma; vma = vma->vm_next; } while (vma); - kleave(" = -EINVAL [split file]"); return -EINVAL; } else { /* the chunk must be a subset of the VMA found */ if (start == vma->vm_start && end == vma->vm_end) goto erase_whole_vma; - if (start < vma->vm_start || end > vma->vm_end) { - kleave(" = -EINVAL [superset]"); + if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - } - if (start & ~PAGE_MASK) { - kleave(" = -EINVAL [unaligned start]"); + if (start & ~PAGE_MASK) return -EINVAL; - } - if (end != vma->vm_end && end & ~PAGE_MASK) { - kleave(" = -EINVAL [unaligned split]"); + if (end != vma->vm_end && end & ~PAGE_MASK) return -EINVAL; - } if (start != vma->vm_start && end != vma->vm_end) { ret = split_vma(mm, vma, start, 1); - if (ret < 0) { - kleave(" = %d [split]", ret); + if (ret < 0) return ret; - } } return shrink_vma(mm, vma, start, end); } @@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) erase_whole_vma: delete_vma_from_mm(vma); delete_vma(mm, vma); - kleave(" = 0"); return 0; } EXPORT_SYMBOL(do_munmap); @@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm) if (!mm) return; - kenter(""); - mm->total_vm = 0; while ((vma = mm->mmap)) { @@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm) delete_vma(mm, vma); cond_resched(); } - - kleave(""); } unsigned long vm_brk(unsigned long addr, unsigned long len) @@ -2157,7 +2085,7 @@ 
static int __meminit init_user_reserve(void) sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; } -module_init(init_user_reserve) +subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. @@ -2178,4 +2106,4 @@ static int __meminit init_admin_reserve(void) sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; } -module_init(init_admin_reserve) +subsys_initcall(init_admin_reserve); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 642f38cb175a..dff991e0681e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -42,7 +42,8 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; -static DEFINE_SPINLOCK(zone_scan_lock); + +DEFINE_MUTEX(oom_lock); #ifdef CONFIG_NUMA /** @@ -405,16 +406,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; -static DECLARE_RWSEM(oom_sem); /** - * mark_tsk_oom_victim - marks the given taks as OOM victim. + * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark * - * Has to be called with oom_sem taken for read and never after + * Has to be called with oom_lock held and never after * oom has been disabled already. */ -void mark_tsk_oom_victim(struct task_struct *tsk) +void mark_oom_victim(struct task_struct *tsk) { WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ @@ -431,23 +431,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk) } /** - * unmark_oom_victim - unmarks the current task as OOM victim. - * - * Wakes up all waiters in oom_killer_disable() + * exit_oom_victim - note the exit of an OOM victim */ -void unmark_oom_victim(void) +void exit_oom_victim(void) { - if (!test_and_clear_thread_flag(TIF_MEMDIE)) - return; + clear_thread_flag(TIF_MEMDIE); - down_read(&oom_sem); - /* - * There is no need to signal the lasst oom_victim if there - * is nobody who cares. 
- */ - if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); - up_read(&oom_sem); } /** @@ -469,14 +460,14 @@ bool oom_killer_disable(void) * Make sure to not race with an ongoing OOM killer * and that the current is not the victim. */ - down_write(&oom_sem); + mutex_lock(&oom_lock); if (test_thread_flag(TIF_MEMDIE)) { - up_write(&oom_sem); + mutex_unlock(&oom_lock); return false; } oom_killer_disabled = true; - up_write(&oom_sem); + mutex_unlock(&oom_lock); wait_event(oom_victims_wait, !atomic_read(&oom_victims)); @@ -488,9 +479,7 @@ bool oom_killer_disable(void) */ void oom_killer_enable(void) { - down_write(&oom_sem); oom_killer_disabled = false; - up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -517,7 +506,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ task_lock(p); if (p->mm && task_will_free_mem(p)) { - mark_tsk_oom_victim(p); + mark_oom_victim(p); task_unlock(p); put_task_struct(p); return; @@ -528,7 +517,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, dump_header(p, gfp_mask, order, memcg, nodemask); task_lock(p); - pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); task_unlock(p); @@ -572,7 +561,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; - mark_tsk_oom_victim(victim); + mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -612,7 +601,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * Determines whether the kernel must panic because of the panic_on_oom sysctl. 
*/ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask) + int order, const nodemask_t *nodemask, + struct mem_cgroup *memcg) { if (likely(!sysctl_panic_on_oom)) return; @@ -625,7 +615,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - dump_header(NULL, gfp_mask, order, NULL, nodemask); + dump_header(NULL, gfp_mask, order, memcg, nodemask); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -644,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_oom_notifier); -/* - * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero - * if a parallel OOM killing is already taking place that includes a zone in - * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. - */ -bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) -{ - struct zoneref *z; - struct zone *zone; - bool ret = true; - - spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { - ret = false; - goto out; - } - - /* - * Lock each zone in the zonelist under zone_scan_lock so a parallel - * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. - */ - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - set_bit(ZONE_OOM_LOCKED, &zone->flags); - -out: - spin_unlock(&zone_scan_lock); - return ret; -} - -/* - * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed - * allocation attempts with zonelists containing them may now recall the OOM - * killer, if necessary. 
- */ -void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) -{ - struct zoneref *z; - struct zone *zone; - - spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - clear_bit(ZONE_OOM_LOCKED, &zone->flags); - spin_unlock(&zone_scan_lock); -} - /** * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer @@ -703,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask, bool force_kill) +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; struct task_struct *p; @@ -714,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, enum oom_constraint constraint = CONSTRAINT_NONE; int killed = 0; + if (oom_killer_disabled) + return false; + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ - return; + goto out; /* * If current has a pending SIGKILL or is exiting, then automatically @@ -729,8 +676,8 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { - mark_tsk_oom_victim(current); - return; + mark_oom_victim(current); + goto out; } /* @@ -740,7 +687,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, constraint = constrained_alloc(zonelist, gfp_mask, nodemask, &totalpages); mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? 
nodemask : NULL; - check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, nodemask) && @@ -770,32 +717,8 @@ out: */ if (killed) schedule_timeout_killable(1); -} - -/** - * out_of_memory - tries to invoke OOM killer. - * @zonelist: zonelist pointer - * @gfp_mask: memory allocation flags - * @order: amount of memory being requested as a power of 2 - * @nodemask: nodemask passed to page allocator - * @force_kill: true if a task must be killed, even if others are exiting - * - * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() - * when it returns false. Otherwise returns true. - */ -bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask, bool force_kill) -{ - bool ret = false; - - down_read(&oom_sem); - if (!oom_killer_disabled) { - __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); - ret = true; - } - up_read(&oom_sem); - return ret; + return true; } /* @@ -805,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ void pagefault_out_of_memory(void) { - struct zonelist *zonelist; - - down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - goto unlock; + return; - zonelist = node_zonelist(first_memory_node, GFP_KERNEL); - if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - if (!oom_killer_disabled) - __out_of_memory(NULL, 0, 0, NULL, false); - else - /* - * There shouldn't be any user tasks runable while the - * OOM killer is disabled so the current task has to - * be a racing OOM victim for which oom_killer_disable() - * is waiting for. 
- */ - WARN_ON(test_thread_flag(TIF_MEMDIE)); + if (!mutex_trylock(&oom_lock)) + return; - oom_zonelist_unlock(zonelist, GFP_KERNEL); + if (!out_of_memory(NULL, 0, 0, NULL, false)) { + /* + * There shouldn't be any user tasks runnable while the + * OOM killer is disabled, so the current task has to + * be a racing OOM victim for which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); } -unlock: - up_read(&oom_sem); + + mutex_unlock(&oom_lock); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 644bcb665773..5cccc127ef81 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -122,31 +122,31 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -unsigned long global_dirty_limit; +struct wb_domain global_wb_domain; -/* - * Scale the writeback cache size proportional to the relative writeout speeds. - * - * We do this by keeping a floating proportion between BDIs, based on page - * writeback completions [end_page_writeback()]. Those devices that write out - * pages fastest will get the larger share, while the slower will get a smaller - * share. - * - * We use page writeout completions because we are interested in getting rid of - * dirty pages. Having them written out is the primary goal. - * - * We introduce a concept of time, a period over which we measure these events, - * because demand can/will vary over time. The length of this period itself is - * measured in page writeback completions. 
- * - */ -static struct fprop_global writeout_completions; +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; -static void writeout_period(unsigned long t); -/* Timer for aging of writeout_completions */ -static struct timer_list writeout_period_timer = - TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); -static unsigned long writeout_period_time = 0; + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; +}; + +#define DTC_INIT_COMMON(__wb) .wb = (__wb), \ + .wb_completions = &(__wb)->completions /* * Length of period for aging writeout fractions of bdis. 
This is an @@ -155,6 +155,97 @@ static unsigned long writeout_period_time = 0; */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) +#ifdef CONFIG_CGROUP_WRITEBACK + +#define GDTC_INIT(__wb) .dom = &global_wb_domain, \ + DTC_INIT_COMMON(__wb) +#define GDTC_INIT_NO_WB .dom = &global_wb_domain +#define MDTC_INIT(__wb, __gdtc) .dom = mem_cgroup_wb_domain(__wb), \ + .gdtc = __gdtc, \ + DTC_INIT_COMMON(__wb) + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return mdtc->gdtc; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return &wb->memcg_completions; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + unsigned long long min = wb->bdi->min_ratio; + unsigned long long max = wb->bdi->max_ratio; + + /* + * @wb may already be clean by the time control reaches here and + * the total may not include its bw. 
+ */ + if (this_bw < tot_bw) { + if (min) { + min *= this_bw; + do_div(min, tot_bw); + } + if (max < 100) { + max *= this_bw; + do_div(max, tot_bw); + } + } + + *minp = min; + *maxp = max; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +#define GDTC_INIT(__wb) DTC_INIT_COMMON(__wb) +#define GDTC_INIT_NO_WB +#define MDTC_INIT(__wb, __gdtc) + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return false; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return &global_wb_domain; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return NULL; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return NULL; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + *minp = wb->bdi->min_ratio; + *maxp = wb->bdi->max_ratio; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of @@ -250,42 +341,88 @@ static unsigned long global_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds +/** + * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain + * @dtc: dirty_throttle_control of interest * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * Calculate @dtc->thresh and ->bg_thresh considering + * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller + * must ensure that @dtc->avail is set before calling this function. The + * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and * real-time tasks. 
*/ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +static void domain_dirty_limits(struct dirty_throttle_control *dtc) { - const unsigned long available_memory = global_dirtyable_memory(); - unsigned long background; - unsigned long dirty; + const unsigned long available_memory = dtc->avail; + struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); + unsigned long bytes = vm_dirty_bytes; + unsigned long bg_bytes = dirty_background_bytes; + unsigned long ratio = vm_dirty_ratio; + unsigned long bg_ratio = dirty_background_ratio; + unsigned long thresh; + unsigned long bg_thresh; struct task_struct *tsk; - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + /* gdtc is !NULL iff @dtc is for memcg domain */ + if (gdtc) { + unsigned long global_avail = gdtc->avail; + + /* + * The byte settings can't be applied directly to memcg + * domains. Convert them to ratios by scaling against + * globally available memory. + */ + if (bytes) + ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 / + global_avail, 100UL); + if (bg_bytes) + bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 / + global_avail, 100UL); + bytes = bg_bytes = 0; + } + + if (bytes) + thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else - dirty = (vm_dirty_ratio * available_memory) / 100; + thresh = (ratio * available_memory) / 100; - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + if (bg_bytes) + bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else - background = (dirty_background_ratio * available_memory) / 100; + bg_thresh = (bg_ratio * available_memory) / 100; - if (background >= dirty) - background = dirty / 2; + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; + bg_thresh += bg_thresh / 4; + thresh += thresh / 4; } - *pbackground = background; - *pdirty = dirty; - 
trace_global_dirty_state(background, dirty); + dtc->thresh = thresh; + dtc->bg_thresh = bg_thresh; + + /* we should eventually report the domain in the TP */ + if (!gdtc) + trace_global_dirty_state(bg_thresh, thresh); +} + +/** + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * @pbackground: out parameter for bg_thresh + * @pdirty: out parameter for thresh + * + * Calculate bg_thresh and thresh for global_wb_domain. See + * domain_dirty_limits() for details. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + + gdtc.avail = global_dirtyable_memory(); + domain_dirty_limits(&gdtc); + + *pbackground = gdtc.bg_thresh; + *pdirty = gdtc.thresh; } /** @@ -392,47 +529,52 @@ static unsigned long wp_next_time(unsigned long cur_time) return cur_time; } -/* - * Increment the BDI's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). - */ -static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +static void wb_domain_writeout_inc(struct wb_domain *dom, + struct fprop_local_percpu *completions, + unsigned int max_prop_frac) { - __inc_bdi_stat(bdi, BDI_WRITTEN); - __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_inc_percpu_max(&dom->completions, completions, + max_prop_frac); /* First event after period switching was turned off? */ - if (!unlikely(writeout_period_time)) { + if (!unlikely(dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. 
*/ - writeout_period_time = wp_next_time(jiffies); - mod_timer(&writeout_period_timer, writeout_period_time); + dom->period_time = wp_next_time(jiffies); + mod_timer(&dom->period_timer, dom->period_time); } } -void bdi_writeout_inc(struct backing_dev_info *bdi) +/* + * Increment @wb's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static inline void __wb_writeout_inc(struct bdi_writeback *wb) +{ + struct wb_domain *cgdom; + + __inc_wb_stat(wb, WB_WRITTEN); + wb_domain_writeout_inc(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac); + + cgdom = mem_cgroup_wb_domain(wb); + if (cgdom) + wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac); +} + +void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); - __bdi_writeout_inc(bdi); + __wb_writeout_inc(wb); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(bdi_writeout_inc); - -/* - * Obtain an accurate fraction of the BDI's portion. 
- */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, - long *numerator, long *denominator) -{ - fprop_fraction_percpu(&writeout_completions, &bdi->completions, - numerator, denominator); -} +EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use @@ -440,22 +582,46 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, */ static void writeout_period(unsigned long t) { - int miss_periods = (jiffies - writeout_period_time) / + struct wb_domain *dom = (void *)t; + int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; - if (fprop_new_period(&writeout_completions, miss_periods + 1)) { - writeout_period_time = wp_next_time(writeout_period_time + + if (fprop_new_period(&dom->completions, miss_periods + 1)) { + dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); - mod_timer(&writeout_period_timer, writeout_period_time); + mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. 
*/ - writeout_period_time = 0; + dom->period_time = 0; } } +int wb_domain_init(struct wb_domain *dom, gfp_t gfp) +{ + memset(dom, 0, sizeof(*dom)); + + spin_lock_init(&dom->lock); + + init_timer_deferrable(&dom->period_timer); + dom->period_timer.function = writeout_period; + dom->period_timer.data = (unsigned long)dom; + + dom->dirty_limit_tstamp = jiffies; + + return fprop_global_init(&dom->completions, gfp); +} + +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom) +{ + del_timer_sync(&dom->period_timer); + fprop_global_destroy(&dom->completions); +} +#endif + /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not @@ -510,17 +676,26 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh, return (thresh + bg_thresh) / 2; } -static unsigned long hard_dirty_limit(unsigned long thresh) +static unsigned long hard_dirty_limit(struct wb_domain *dom, + unsigned long thresh) { - return max(thresh, global_dirty_limit); + return max(thresh, dom->dirty_limit); +} + +/* memory available to a memcg domain is capped by system-wide clean memory */ +static void mdtc_cap_avail(struct dirty_throttle_control *mdtc) +{ + struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); + unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); + + mdtc->avail = min(mdtc->avail, clean); } /** - * bdi_dirty_limit - @bdi's share of dirty throttling threshold - * @bdi: the backing_dev_info to query - * @dirty: global dirty limit in pages + * __wb_calc_thresh - @wb's share of dirty throttling threshold + * @dtc: dirty_throttle_context of interest * - * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * Returns @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. 
* * Note that balance_dirty_pages() will only seriously take it as a hard limit @@ -528,34 +703,47 @@ static unsigned long hard_dirty_limit(unsigned long thresh) * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks - * more (rather than completely block them) when the bdi dirty pages go high. + * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * - * The bdi's share of dirty limit will be adapting to its throughput and + * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. */ -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { - u64 bdi_dirty; + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + u64 wb_thresh; long numerator, denominator; + unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the dirty ratio. + * Calculate this BDI's share of the thresh ratio. 
*/ - bdi_writeout_fraction(bdi, &numerator, &denominator); + fprop_fraction_percpu(&dom->completions, dtc->wb_completions, + &numerator, &denominator); + + wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh *= numerator; + do_div(wb_thresh, denominator); - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); + wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / 100; + if (wb_thresh > (thresh * wb_max_ratio) / 100) + wb_thresh = thresh * wb_max_ratio / 100; - return bdi_dirty; + return wb_thresh; +} + +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT(wb), + .thresh = thresh }; + return __wb_calc_thresh(&gdtc); } /* @@ -580,7 +768,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, long x; x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, - limit - setpoint + 1); + (limit - setpoint) | 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; @@ -594,7 +782,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * * (o) global/bdi setpoints * - * We want the dirty pages be balanced around the global/bdi setpoints. + * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. 
@@ -604,8 +792,8 @@ static long long pos_ratio_polynom(unsigned long setpoint, * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * - * if (bdi_dirty < bdi_setpoint) scale up pos_ratio - * if (bdi_dirty > bdi_setpoint) scale down pos_ratio + * if (wb_dirty < wb_setpoint) scale up pos_ratio + * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * @@ -630,7 +818,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * - * (o) bdi control line + * (o) wb control line * * ^ pos_ratio * | @@ -656,33 +844,32 @@ static long long pos_ratio_polynom(unsigned long setpoint, * | . . * | . . * 0 +----------------------.-------------------------------.-------------> - * bdi_setpoint^ x_intercept^ + * wb_setpoint^ x_intercept^ * - * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can + * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD - * card's bdi_dirty may rush to many times higher than bdi_setpoint. - * - the bdi dirty thresh drops quickly due to change of JBOD workload + * card's wb_dirty may rush to many times higher than wb_setpoint. 
+ * - the wb dirty thresh drops quickly due to change of JBOD workload */ -static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty) -{ - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_position_ratio(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ - unsigned long bdi_setpoint; + unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; - if (unlikely(dirty >= limit)) - return 0; + dtc->pos_ratio = 0; + + if (unlikely(dtc->dirty >= limit)) + return; /* * global setpoint @@ -690,165 +877,167 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; - pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For - * such filesystems balance_dirty_pages always checks bdi counters - * against bdi limits. Even if global "nr_dirty" is under "freerun". + * such filesystems balance_dirty_pages always checks wb counters + * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. 
Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * - * Here, in bdi_position_ratio(), we calculate pos_ratio based on - * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * Here, in wb_position_ratio(), we calculate pos_ratio based on + * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). - * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. - * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is - * about ~6K pages (as the average of background and throttle bdi + * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is + * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if - * bdi_dirty is under bdi_setpoint and vice versa. + * wb_dirty is under wb_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations - * because we want to throttle process writing to a strictlimit BDI + * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). 
*/ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - long long bdi_pos_ratio; - unsigned long bdi_bg_thresh; + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long wb_pos_ratio; - if (bdi_dirty < 8) - return min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); + if (dtc->wb_dirty < 8) { + dtc->pos_ratio = min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + return; + } - if (bdi_dirty >= bdi_thresh) - return 0; + if (dtc->wb_dirty >= wb_thresh) + return; - bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); - bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, - bdi_bg_thresh); + wb_setpoint = dirty_freerun_ceiling(wb_thresh, + dtc->wb_bg_thresh); - if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) - return 0; + if (wb_setpoint == 0 || wb_setpoint == wb_thresh) + return; - bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, - bdi_thresh); + wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, + wb_thresh); /* - * Typically, for strictlimit case, bdi_setpoint << setpoint - * and pos_ratio >> bdi_pos_ratio. In the other words global + * Typically, for strictlimit case, wb_setpoint << setpoint + * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to - * make decision based on bdi counters. But there is an + * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other - * BDIs) while given strictlimit BDI is below limit. + * wb's) while given strictlimit wb is below limit. * - * "pos_ratio * bdi_pos_ratio" would work for the case above, + * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all - * activity in the system coming from a single strictlimit BDI + * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. 
* * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 - * (when globally we are at freerun and bdi is well below bdi + * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ - return min(pos_ratio, bdi_pos_ratio); + dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); + return; } /* * We have computed basic pos_ratio above based on global situation. If - * the bdi is over/under its share of dirty pages, we want to scale + * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* - * bdi setpoint + * wb setpoint * - * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) + * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * - * x_intercept - bdi_dirty + * x_intercept - wb_dirty * := -------------------------- - * x_intercept - bdi_setpoint + * x_intercept - wb_setpoint * - * The main bdi control line is a linear function that subjects to + * The main wb control line is a linear function that subjects to * - * (1) f(bdi_setpoint) = 1.0 - * (2) k = - 1 / (8 * write_bw) (in single bdi case) - * or equally: x_intercept = bdi_setpoint + 8 * write_bw + * (1) f(wb_setpoint) = 1.0 + * (2) k = - 1 / (8 * write_bw) (in single wb case) + * or equally: x_intercept = wb_setpoint + 8 * write_bw * - * For single bdi case, the dirty pages are observed to fluctuate + * For single wb case, the dirty pages are observed to fluctuate * regularly within range - * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] + * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * - * For JBOD case, bdi_thresh (not bdi_dirty!) 
could fluctuate up to its + * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that - * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. + * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ - if (unlikely(bdi_thresh > thresh)) - bdi_thresh = thresh; + if (unlikely(wb_thresh > dtc->thresh)) + wb_thresh = dtc->thresh; /* - * It's very possible that bdi_thresh is close to 0 not because the + * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ - bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); + wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* - * scale global setpoint to bdi's: - * bdi_setpoint = setpoint * bdi_thresh / thresh + * scale global setpoint to wb's: + * wb_setpoint = setpoint * wb_thresh / thresh */ - x = div_u64((u64)bdi_thresh << 16, thresh + 1); - bdi_setpoint = setpoint * (u64)x >> 16; + x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); + wb_setpoint = setpoint * (u64)x >> 16; /* - * Use span=(8*write_bw) in single bdi case as indicated by - * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. + * Use span=(8*write_bw) in single wb case as indicated by + * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. 
* - * bdi_thresh thresh - bdi_thresh - * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh - * thresh thresh + * wb_thresh thresh - wb_thresh + * span = --------- * (8 * write_bw) + ------------------ * wb_thresh + * thresh thresh */ - span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; - x_intercept = bdi_setpoint + span; + span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; + x_intercept = wb_setpoint + span; - if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), - x_intercept - bdi_setpoint + 1); + if (dtc->wb_dirty < x_intercept - span / 4) { + pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), + (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* - * bdi reserve area, safeguard against dirty pool underrun and disk idle + * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. 
*/ - x_intercept = bdi_thresh / 2; - if (bdi_dirty < x_intercept) { - if (bdi_dirty > x_intercept / 8) - pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty); + x_intercept = wb_thresh / 2; + if (dtc->wb_dirty < x_intercept) { + if (dtc->wb_dirty > x_intercept / 8) + pos_ratio = div_u64(pos_ratio * x_intercept, + dtc->wb_dirty); else pos_ratio *= 8; } - return pos_ratio; + dtc->pos_ratio = pos_ratio; } -static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, - unsigned long elapsed, - unsigned long written) +static void wb_update_write_bandwidth(struct bdi_writeback *wb, + unsigned long elapsed, + unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); - unsigned long avg = bdi->avg_write_bandwidth; - unsigned long old = bdi->write_bandwidth; + unsigned long avg = wb->avg_write_bandwidth; + unsigned long old = wb->write_bandwidth; u64 bw; /* @@ -861,14 +1050,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * @written may have decreased due to account_page_redirty(). * Avoid underflowing @bw calculation. 
*/ - bw = written - min(written, bdi->written_stamp); + bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); avg = bw; goto out; } - bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* @@ -881,21 +1070,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, avg += (old - avg) >> 3; out: - bdi->write_bandwidth = bw; - bdi->avg_write_bandwidth = avg; + /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ + avg = max(avg, 1LU); + if (wb_has_dirty_io(wb)) { + long delta = avg - wb->avg_write_bandwidth; + WARN_ON_ONCE(atomic_long_add_return(delta, + &wb->bdi->tot_write_bandwidth) <= 0); + } + wb->write_bandwidth = bw; + wb->avg_write_bandwidth = avg; } -/* - * The global dirtyable memory and dirty threshold could be suddenly knocked - * down by a large amount (eg. on the startup of KVM in a swapless system). - * This may throw the system into deep dirty exceeded state and throttle - * heavy/light dirtiers alike. To retain good responsiveness, maintain - * global_dirty_limit for tracking slowly down to the knocked down dirty - * threshold. - */ -static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +static void update_dirty_limit(struct dirty_throttle_control *dtc) { - unsigned long limit = global_dirty_limit; + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + unsigned long limit = dom->dirty_limit; /* * Follow up in one step. @@ -908,63 +1098,57 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty) /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce - * global_dirty_limit which is guaranteed to lie above the dirty pages. + * dom->dirty_limit which is guaranteed to lie above the dirty pages. 
*/ - thresh = max(thresh, dirty); + thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: - global_dirty_limit = limit; + dom->dirty_limit = limit; } -static void global_update_bandwidth(unsigned long thresh, - unsigned long dirty, +static void domain_update_bandwidth(struct dirty_throttle_control *dtc, unsigned long now) { - static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time = INITIAL_JIFFIES; + struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ - if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; - spin_lock(&dirty_lock); - if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { - update_dirty_limit(thresh, dirty); - update_time = now; + spin_lock(&dom->lock); + if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { + update_dirty_limit(dtc); + dom->dirty_limit_tstamp = now; } - spin_unlock(&dirty_lock); + spin_unlock(&dom->lock); } /* - * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. + * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * - * Normal bdi tasks will be curbed at or below it in long term. + * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. 
*/ -static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long dirtied, - unsigned long elapsed) -{ - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, + unsigned long dirtied, + unsigned long elapsed) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long dirty = dtc->dirty; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long dirty_ratelimit = bdi->dirty_ratelimit; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; - unsigned long pos_ratio; unsigned long step; unsigned long x; @@ -972,20 +1156,18 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ - dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; - pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty); /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * - pos_ratio >> RATELIMIT_CALC_SHIFT; + dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. 
The theory is, - * if there are N dd tasks, each throttled at task_ratelimit, the bdi's + * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * @@ -1024,7 +1206,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, /* * We could safely do this and return immediately: * - * bdi->dirty_ratelimit = balanced_dirty_ratelimit; + * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and @@ -1058,32 +1240,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, step = 0; /* - * For strictlimit case, calculations above were based on bdi counters - * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * For strictlimit case, calculations above were based on wb counters + * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). - * Hence, to calculate "step" properly, we have to use bdi_dirty as - * "dirty" and bdi_setpoint as "setpoint". + * Hence, to calculate "step" properly, we have to use wb_dirty as + * "dirty" and wb_setpoint as "setpoint". * - * We rampup dirty_ratelimit forcibly if bdi_dirty is low because - * it's possible that bdi_thresh is close to zero due to inactivity - * of backing device (see the implementation of bdi_dirty_limit()). + * We rampup dirty_ratelimit forcibly if wb_dirty is low because + * it's possible that wb_thresh is close to zero due to inactivity + * of backing device. 
*/ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - dirty = bdi_dirty; - if (bdi_dirty < 8) - setpoint = bdi_dirty + 1; + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = dtc->wb_dirty; + if (dtc->wb_dirty < 8) + setpoint = dtc->wb_dirty + 1; else - setpoint = (bdi_thresh + - bdi_dirty_limit(bdi, bg_thresh)) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { - x = min3(bdi->balanced_dirty_ratelimit, + x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { - x = max3(bdi->balanced_dirty_ratelimit, + x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; @@ -1105,69 +1286,67 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, else dirty_ratelimit -= step; - bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); - bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; + wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; - trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); + trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); } -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, + struct dirty_throttle_control *mdtc, + unsigned long start_time, + bool update_ratelimit) { + struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long elapsed = now - wb->bw_time_stamp; unsigned long dirtied; unsigned long written; + lockdep_assert_held(&wb->list_lock); + /* * rate-limit, only update once every 200ms. 
*/ if (elapsed < BANDWIDTH_INTERVAL) return; - dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); - written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); + written = percpu_counter_read(&wb->stat[WB_WRITTEN]); /* * Skip quiet periods when disk bandwidth is under-utilized. * (at least 1s idle time between two flusher runs) */ - if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) goto snapshot; - if (thresh) { - global_update_bandwidth(thresh, dirty, now); - bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, - dirtied, elapsed); + if (update_ratelimit) { + domain_update_bandwidth(gdtc, now); + wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); + + /* + * @mdtc is always NULL if !CGROUP_WRITEBACK but the + * compiler has no way to figure that out. Help it. + */ + if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { + domain_update_bandwidth(mdtc, now); + wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); + } } - bdi_update_write_bandwidth(bdi, elapsed, written); + wb_update_write_bandwidth(wb, elapsed, written); snapshot: - bdi->dirtied_stamp = dirtied; - bdi->written_stamp = written; - bdi->bw_time_stamp = now; + wb->dirtied_stamp = dirtied; + wb->written_stamp = written; + wb->bw_time_stamp = now; } -static void bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) - return; - spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, start_time); - spin_unlock(&bdi->wb.list_lock); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) 
}; + + __wb_update_bandwidth(&gdtc, NULL, start_time, false); } /* @@ -1187,10 +1366,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long wb_max_pause(struct bdi_writeback *wb, + unsigned long wb_dirty) { - unsigned long bw = bdi->avg_write_bandwidth; + unsigned long bw = wb->avg_write_bandwidth; unsigned long t; /* @@ -1200,20 +1379,20 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, * * 8 serves as the safety ratio. */ - t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } -static long bdi_min_pause(struct backing_dev_info *bdi, - long max_pause, - unsigned long task_ratelimit, - unsigned long dirty_ratelimit, - int *nr_dirtied_pause) +static long wb_min_pause(struct bdi_writeback *wb, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) { - long hi = ilog2(bdi->avg_write_bandwidth); - long lo = ilog2(bdi->dirty_ratelimit); + long hi = ilog2(wb->avg_write_bandwidth); + long lo = ilog2(wb->dirty_ratelimit); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1281,34 +1460,27 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 
1 + t / 2 : t; } -static inline void bdi_dirty_limits(struct backing_dev_info *bdi, - unsigned long dirty_thresh, - unsigned long background_thresh, - unsigned long *bdi_dirty, - unsigned long *bdi_thresh, - unsigned long *bdi_bg_thresh) +static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { - unsigned long bdi_reclaimable; + struct bdi_writeback *wb = dtc->wb; + unsigned long wb_reclaimable; /* - * bdi_thresh is not treated as some limiting factor as + * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. + * go into state (wb_dirty >> wb_thresh) either because + * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + * dirtiers for 100 seconds until wb_dirty drops under + * wb_thresh. Instead the auxiliary wb control line in + * wb_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - - if (bdi_bg_thresh) - *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * - background_thresh, - dirty_thresh) : 0; + dtc->wb_thresh = __wb_calc_thresh(dtc); + dtc->wb_bg_thresh = dtc->thresh ? 
+ div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -1320,14 +1492,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * actually dirty; with m+n sitting in the percpu * deltas. */ - if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); + if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { + wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); + wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } @@ -1339,12 +1509,16 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, + struct bdi_writeback *wb, unsigned long pages_dirtied) { + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
+ &mdtc_stor : NULL; + struct dirty_throttle_control *sdtc; unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long background_thresh; - unsigned long dirty_thresh; long period; long pause; long max_pause; @@ -1353,18 +1527,14 @@ static void balance_dirty_pages(struct address_space *mapping, bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; - unsigned long pos_ratio; - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; - unsigned long uninitialized_var(bdi_thresh); - unsigned long thresh; - unsigned long uninitialized_var(bdi_dirty); - unsigned long dirty; - unsigned long bg_thresh; + unsigned long dirty, thresh, bg_thresh; + unsigned long m_dirty, m_thresh, m_bg_thresh; /* * Unstable writes are a feature of certain networked @@ -1374,65 +1544,127 @@ static void balance_dirty_pages(struct address_space *mapping, */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); - global_dirty_limits(&background_thresh, &dirty_thresh); + domain_dirty_limits(gdtc); if (unlikely(strictlimit)) { - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, &bg_thresh); + wb_dirty_limits(gdtc); - dirty = bdi_dirty; - thresh = bdi_thresh; + dirty = gdtc->wb_dirty; + thresh = gdtc->wb_thresh; + bg_thresh = gdtc->wb_bg_thresh; } else { - dirty = nr_dirty; - thresh = dirty_thresh; - bg_thresh = background_thresh; + dirty = gdtc->dirty; + thresh = gdtc->thresh; + bg_thresh = gdtc->bg_thresh; + } + + if (mdtc) { + unsigned long writeback; + + /* 
+ * If @wb belongs to !root memcg, repeat the same + * basic calculations for the memcg domain. + */ + mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, + &writeback); + mdtc_cap_avail(mdtc); + mdtc->dirty += writeback; + + domain_dirty_limits(mdtc); + + if (unlikely(strictlimit)) { + wb_dirty_limits(mdtc); + m_dirty = mdtc->wb_dirty; + m_thresh = mdtc->wb_thresh; + m_bg_thresh = mdtc->wb_bg_thresh; + } else { + m_dirty = mdtc->dirty; + m_thresh = mdtc->thresh; + m_bg_thresh = mdtc->bg_thresh; + } } /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up in case of !strictlimit. + * when the wb limits are ramping up in case of !strictlimit. * - * In strictlimit case make decision based on the bdi counters - * and limits. Small writeouts when the bdi limits are ramping + * In strictlimit case make decision based on the wb counters + * and limits. Small writeouts when the wb limits are ramping * up are the price we consciously pay for strictlimit-ing. + * + * If memcg domain is in effect, @dirty should be under + * both global and memcg freerun ceilings. */ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && + (!mdtc || + m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + unsigned long intv = dirty_poll_interval(dirty, thresh); + unsigned long m_intv = ULONG_MAX; + current->dirty_paused_when = now; current->nr_dirtied = 0; - current->nr_dirtied_pause = - dirty_poll_interval(dirty, thresh); + if (mdtc) + m_intv = dirty_poll_interval(m_dirty, m_thresh); + current->nr_dirtied_pause = min(intv, m_intv); break; } - if (unlikely(!writeback_in_progress(bdi))) - bdi_start_background_writeback(bdi); + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); + /* + * Calculate global domain's pos_ratio and select the + * global dtc by default. 
+ */ if (!strictlimit) - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, NULL); - - dirty_exceeded = (bdi_dirty > bdi_thresh) && - ((nr_dirty > dirty_thresh) || strictlimit); - if (dirty_exceeded && !bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; - - bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, - nr_dirty, bdi_thresh, bdi_dirty, - start_time); - - dirty_ratelimit = bdi->dirty_ratelimit; - pos_ratio = bdi_position_ratio(bdi, dirty_thresh, - background_thresh, nr_dirty, - bdi_thresh, bdi_dirty); - task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> + wb_dirty_limits(gdtc); + + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && + ((gdtc->dirty > gdtc->thresh) || strictlimit); + + wb_position_ratio(gdtc); + sdtc = gdtc; + + if (mdtc) { + /* + * If memcg domain is in effect, calculate its + * pos_ratio. @wb should satisfy constraints from + * both global and memcg domains. Choose the one + * w/ lower pos_ratio. + */ + if (!strictlimit) + wb_dirty_limits(mdtc); + + dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && + ((mdtc->dirty > mdtc->thresh) || strictlimit); + + wb_position_ratio(mdtc); + if (mdtc->pos_ratio < gdtc->pos_ratio) + sdtc = mdtc; + } + + if (dirty_exceeded && !wb->dirty_exceeded) + wb->dirty_exceeded = 1; + + if (time_is_before_jiffies(wb->bw_time_stamp + + BANDWIDTH_INTERVAL)) { + spin_lock(&wb->list_lock); + __wb_update_bandwidth(gdtc, mdtc, start_time, true); + spin_unlock(&wb->list_lock); + } + + /* throttle according to the chosen dtc */ + dirty_ratelimit = wb->dirty_ratelimit; + task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; - max_pause = bdi_max_pause(bdi, bdi_dirty); - min_pause = bdi_min_pause(bdi, max_pause, - task_ratelimit, dirty_ratelimit, - &nr_dirtied_pause); + max_pause = wb_max_pause(wb, sdtc->wb_dirty); + min_pause = wb_min_pause(wb, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { 
period = max_pause; @@ -1452,11 +1684,11 @@ static void balance_dirty_pages(struct address_space *mapping, */ if (pause < min_pause) { trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1481,11 +1713,11 @@ static void balance_dirty_pages(struct address_space *mapping, pause: trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1500,33 +1732,33 @@ pause: current->nr_dirtied_pause = nr_dirtied_pause; /* - * This is typically equal to (nr_dirty < dirty_thresh) and can - * also keep "1000+ dd on a slow USB stick" under control. + * This is typically equal to (dirty < thresh) and can also + * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponding NFS server and the NFS dirty - * pages exceeds dirty_thresh, give the other good bdi's a pipe + * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the comsumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 - * more page. However bdi_dirty has accounting errors. So use - * the larger and more IO friendly bdi_stat_error. + * more page. However wb_dirty has accounting errors. So use + * the larger and more IO friendly wb_stat_error. 
*/ - if (bdi_dirty <= bdi_stat_error(bdi)) + if (sdtc->wb_dirty <= wb_stat_error(wb)) break; if (fatal_signal_pending(current)) break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (!dirty_exceeded && wb->dirty_exceeded) + wb->dirty_exceeded = 0; - if (writeback_in_progress(bdi)) + if (writeback_in_progress(wb)) return; /* @@ -1540,8 +1772,8 @@ pause: if (laptop_mode) return; - if (nr_reclaimable > background_thresh) - bdi_start_background_writeback(bdi); + if (nr_reclaimable > gdtc->bg_thresh) + wb_start_background_writeback(wb); } static DEFINE_PER_CPU(int, bdp_ratelimits); @@ -1577,15 +1809,22 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; int ratelimit; int *p; if (!bdi_cap_account_dirty(bdi)) return; + if (inode_cgwb_enabled(inode)) + wb = wb_get_create_current(bdi, GFP_KERNEL); + if (!wb) + wb = &bdi->wb; + ratelimit = current->nr_dirtied_pause; - if (bdi->dirty_exceeded) + if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); @@ -1617,10 +1856,59 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(mapping, current->nr_dirtied); + balance_dirty_pages(mapping, wb, current->nr_dirtied); + + wb_put(wb); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +/** + * wb_over_bg_thresh - does @wb need to be written back? + * @wb: bdi_writeback of interest + * + * Determines whether background writeback should keep writing @wb or it's + * clean enough. Returns %true if writeback should continue. 
+ */ +bool wb_over_bg_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? + &mdtc_stor : NULL; + + /* + * Similar to balance_dirty_pages() but ignores pages being written + * as we're trying to decide whether to put more under writeback. + */ + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + domain_dirty_limits(gdtc); + + if (gdtc->dirty > gdtc->bg_thresh) + return true; + + if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc)) + return true; + + if (mdtc) { + unsigned long writeback; + + mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback); + mdtc_cap_avail(mdtc); + domain_dirty_limits(mdtc); /* ditto, ignore writeback */ + + if (mdtc->dirty > mdtc->bg_thresh) + return true; + + if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc)) + return true; + } + + return false; +} + void throttle_vm_writeout(gfp_t gfp_mask) { unsigned long background_thresh; @@ -1628,7 +1916,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) for ( ; ; ) { global_dirty_limits(&background_thresh, &dirty_thresh); - dirty_thresh = hard_dirty_limit(dirty_thresh); + dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); /* * Boost the allowable dirty threshold a bit for page @@ -1667,14 +1955,20 @@ void laptop_mode_timer_fn(unsigned long data) struct request_queue *q = (struct request_queue *)data; int nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); + struct bdi_writeback *wb; + struct wb_iter iter; /* * We want to write everything out, not just down to the dirty * threshold */ - if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, nr_pages, - WB_REASON_LAPTOP_TIMER); + if 
(!bdi_has_dirty_io(&q->backing_dev_info)) + return; + + bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0) + if (wb_has_dirty_io(wb)) + wb_start_writeback(wb, nr_pages, true, + WB_REASON_LAPTOP_TIMER); } /* @@ -1718,10 +2012,12 @@ void laptop_sync_completion(void) void writeback_set_ratelimit(void) { + struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; + global_dirty_limits(&background_thresh, &dirty_thresh); - global_dirty_limit = dirty_thresh; + dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; @@ -1767,10 +2063,10 @@ static struct notifier_block ratelimit_nb = { */ void __init page_writeback_init(void) { + BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); + writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - - fprop_global_init(&writeout_completions, GFP_KERNEL); } /** @@ -2090,19 +2386,29 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. + * + * Caller must hold mem_cgroup_begin_page_stat(). + * * NOTE: This relies on being atomic wrt interrupts. 
*/ -void account_page_dirtied(struct page *page, struct address_space *mapping) +void account_page_dirtied(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg) { + struct inode *inode = mapping->host; + trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct bdi_writeback *wb; + inode_attach_wb(inode, page); + wb = inode_to_wb(inode); + + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(bdi, BDI_RECLAIMABLE); - __inc_bdi_stat(bdi, BDI_DIRTIED); + __inc_wb_stat(wb, WB_RECLAIMABLE); + __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2111,6 +2417,22 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) EXPORT_SYMBOL(account_page_dirtied); /* + * Helper function for deaccounting dirty page without writeback. + * + * Caller must hold mem_cgroup_begin_page_stat(). + */ +void account_page_cleaned(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, struct bdi_writeback *wb) +{ + if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_wb_stat(wb, WB_RECLAIMABLE); + task_io_account_cancelled_write(PAGE_CACHE_SIZE); + } +} + +/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. 
* @@ -2124,26 +2446,34 @@ EXPORT_SYMBOL(account_page_dirtied); */ int __set_page_dirty_nobuffers(struct page *page) { + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; - if (!mapping) + if (!mapping) { + mem_cgroup_end_page_stat(memcg); return 1; + } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); + if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } + mem_cgroup_end_page_stat(memcg); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2158,10 +2488,17 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); void account_page_redirty(struct page *page) { struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + bool locked; + + wb = unlocked_inode_to_wb_begin(inode, &locked); current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); + dec_wb_stat(wb, WB_DIRTIED); + unlocked_inode_to_wb_end(inode, locked); } } EXPORT_SYMBOL(account_page_redirty); @@ -2209,7 +2546,8 @@ int set_page_dirty(struct page *page) * it will confuse readahead and make it restart the size rampup * process. But it's a trivial problem. 
*/ - ClearPageReclaim(page); + if (PageReclaim(page)) + ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; @@ -2246,6 +2584,43 @@ int set_page_dirty_lock(struct page *page) EXPORT_SYMBOL(set_page_dirty_lock); /* + * This cancels just the dirty bit on the kernel page itself, it does NOT + * actually remove dirty bits on any mmap's that may be around. It also + * leaves the page tagged dirty, so any sync activity will still find it on + * the dirty lists, and in particular, clear_page_dirty_for_io() will still + * look at the dirty bits in the VM. + * + * Doing this should *normally* only ever be done when a page is truncated, + * and is not actually mapped anywhere at all. However, fs/buffer.c does + * this when it notices that somebody has cleaned out all the buffers on a + * page without actually doing it through the VM. Can you say "ext3 is + * horribly ugly"? Thought you could. + */ +void cancel_dirty_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct mem_cgroup *memcg; + bool locked; + + memcg = mem_cgroup_begin_page_stat(page); + wb = unlocked_inode_to_wb_begin(inode, &locked); + + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping, memcg, wb); + + unlocked_inode_to_wb_end(inode, locked); + mem_cgroup_end_page_stat(memcg); + } else { + ClearPageDirty(page); + } +} +EXPORT_SYMBOL(cancel_dirty_page); + +/* * Clear a page's dirty flag, while caring for dirty memory accounting. * Returns true if the page was previously dirty. 
* @@ -2262,10 +2637,16 @@ EXPORT_SYMBOL(set_page_dirty_lock); int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); + int ret = 0; BUG_ON(!PageLocked(page)); if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct mem_cgroup *memcg; + bool locked; + /* * Yes, Virginia, this is indeed insane. * @@ -2301,13 +2682,17 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ + memcg = mem_cgroup_begin_page_stat(page); + wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - return 1; + dec_wb_stat(wb, WB_RECLAIMABLE); + ret = 1; } - return 0; + unlocked_inode_to_wb_end(inode, locked); + mem_cgroup_end_page_stat(memcg); + return ret; } return TestClearPageDirty(page); } @@ -2321,7 +2706,8 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2331,8 +2717,10 @@ int test_clear_page_writeback(struct page *page) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { - __dec_bdi_stat(bdi, BDI_WRITEBACK); - __bdi_writeout_inc(bdi); + struct bdi_writeback *wb = inode_to_wb(inode); + + __dec_wb_stat(wb, WB_WRITEBACK); + __wb_writeout_inc(wb); } } spin_unlock_irqrestore(&mapping->tree_lock, flags); @@ -2356,7 +2744,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + 
struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2366,7 +2755,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_bdi_stat(bdi, BDI_WRITEBACK); + __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40e29429e7b0..beda41710802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -61,6 +61,7 @@ #include <linux/hugetlb.h> #include <linux/sched/rt.h> #include <linux/page_owner.h> +#include <linux/kthread.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -235,6 +236,75 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ + pgdat->first_deferred_pfn = ULONG_MAX; +} + +/* Returns true if the struct page for the pfn is uninitialised */ +static inline bool __meminit early_page_uninitialised(unsigned long pfn) +{ + if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) + return true; + + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +/* + * Returns false when the remaining initialisation should be deferred until + * later in the boot cycle when it can be parallelised. 
+ */ +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + /* Always populate low zones for address-contrained allocations */ + if (zone_end < pgdat_end_pfn(pgdat)) + return true; + + /* Initialise at least 2G of the highest zone */ + (*nr_initialised)++; + if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + pgdat->first_deferred_pfn = pfn; + return false; + } + + return true; +} +#else +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ +} + +static inline bool early_page_uninitialised(unsigned long pfn) +{ + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + return false; +} + +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + return true; +} +#endif + + void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled && @@ -380,20 +450,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -static inline void prep_zero_page(struct page *page, unsigned int order, - gfp_t gfp_flags) -{ - int i; - - /* - * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO - * and __GFP_HIGHMEM from hard or soft interrupt context. 
- */ - VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); -} - #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly; @@ -778,6 +834,75 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return 0; } +static void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid) +{ + set_page_links(page, zone, nid, pfn); + init_page_count(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +} + +static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, + int nid) +{ + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void init_reserved_page(unsigned long pfn) +{ + pg_data_t *pgdat; + int nid, zid; + + if (!early_page_uninitialised(pfn)) + return; + + nid = early_pfn_to_nid(pfn); + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + break; + } + __init_single_pfn(pfn, zid, nid); +} +#else +static inline void init_reserved_page(unsigned long pfn) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +/* + * Initialised pages do not have PageReserved set. This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. 
+ */ +void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) { + struct page *page = pfn_to_page(start_pfn); + + init_reserved_page(start_pfn); + SetPageReserved(page); + } + } +} + static bool free_pages_prepare(struct page *page, unsigned int order) { bool compound = PageCompound(page); @@ -832,7 +957,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -void __init __free_pages_bootmem(struct page *page, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, + unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -852,6 +978,235 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ + defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) + +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + static DEFINE_SPINLOCK(early_pfn_lock); + int nid; + + spin_lock(&early_pfn_lock); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid < 0) + nid = 0; + spin_unlock(&early_pfn_lock); + + return nid; +} +#endif + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + int nid; + + nid = __early_pfn_to_nid(pfn, state); + if (nid >= 0 && nid != node) + return false; + return true; +} + +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + +#else + +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return true; 
+} +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + return true; +} +#endif + + +void __init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + return __free_pages_boot_core(page, pfn, order); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __init deferred_free_range(struct page *page, + unsigned long pfn, int nr_pages) +{ + int i; + + if (!page) + return; + + /* Free a large naturally-aligned chunk if possible */ + if (nr_pages == MAX_ORDER_NR_PAGES && + (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_boot_core(page, pfn, MAX_ORDER-1); + return; + } + + for (i = 0; i < nr_pages; i++, page++, pfn++) + __free_pages_boot_core(page, pfn, 0); +} + +/* Completion tracking for deferred_init_memmap() threads */ +static atomic_t pgdat_init_n_undone __initdata; +static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); + +static inline void __init pgdat_init_report_one_done(void) +{ + if (atomic_dec_and_test(&pgdat_init_n_undone)) + complete(&pgdat_init_all_done_comp); +} + +/* Initialise remaining memory on a node */ +static int __init deferred_init_memmap(void *data) +{ + pg_data_t *pgdat = data; + int nid = pgdat->node_id; + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long start = jiffies; + unsigned long nr_pages = 0; + unsigned long walk_start, walk_end; + int i, zid; + struct zone *zone; + unsigned long first_init_pfn = pgdat->first_deferred_pfn; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (first_init_pfn == ULONG_MAX) { + pgdat_init_report_one_done(); + return 0; + } + + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + /* Sanity check boundaries */ + BUG_ON(pgdat->first_deferred_pfn < 
pgdat->node_start_pfn); + BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); + pgdat->first_deferred_pfn = ULONG_MAX; + + /* Only the highest zone is deferred so find it */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + zone = pgdat->node_zones + zid; + if (first_init_pfn < zone_end_pfn(zone)) + break; + } + + for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { + unsigned long pfn, end_pfn; + struct page *page = NULL; + struct page *free_base_page = NULL; + unsigned long free_base_pfn = 0; + int nr_to_free = 0; + + end_pfn = min(walk_end, zone_end_pfn(zone)); + pfn = first_init_pfn; + if (pfn < walk_start) + pfn = walk_start; + if (pfn < zone->zone_start_pfn) + pfn = zone->zone_start_pfn; + + for (; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + goto free_range; + + /* + * Ensure pfn_valid is checked every + * MAX_ORDER_NR_PAGES for memory holes + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(pfn)) { + page = NULL; + goto free_range; + } + } + + if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + page = NULL; + goto free_range; + } + + /* Minimise pfn page lookups and scheduler checks */ + if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + page++; + } else { + nr_pages += nr_to_free; + deferred_free_range(free_base_page, + free_base_pfn, nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + + page = pfn_to_page(pfn); + cond_resched(); + } + + if (page->flags) { + VM_BUG_ON(page_zone(page) != zone); + goto free_range; + } + + __init_single_page(page, pfn, zid, nid); + if (!free_base_page) { + free_base_page = page; + free_base_pfn = pfn; + nr_to_free = 0; + } + nr_to_free++; + + /* Where possible, batch up pages for a single free */ + continue; +free_range: + /* Free the current block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, + nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + } + + first_init_pfn = 
max(end_pfn, first_init_pfn); + } + + /* Sanity check that the next zone really is unpopulated */ + WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + + pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, + jiffies_to_msecs(jiffies - start)); + + pgdat_init_report_one_done(); + return 0; +} + +void __init page_alloc_init_late(void) +{ + int nid; + + /* There will be num_node_state(N_MEMORY) threads */ + atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); + for_each_node_state(nid, N_MEMORY) { + kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); + } + + /* Block until all are initialised */ + wait_for_completion(&pgdat_init_all_done_comp); + + /* Reinit limits that are based on free pages after the kernel is up */ + files_maxfiles_init(); +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) @@ -941,6 +1296,10 @@ static inline int check_new_page(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(atomic_read(&page->_count) != 0)) bad_reason = "nonzero _count"; + if (unlikely(page->flags & __PG_HWPOISON)) { + bad_reason = "HWPoisoned (hardware-corrupted)"; + bad_flags = __PG_HWPOISON; + } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; @@ -975,7 +1334,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -1032,11 +1392,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, static int fallbacks[MIGRATE_TYPES][4] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, 
MIGRATE_RESERVE }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, #ifdef CONFIG_CMA - [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ -#else - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, #endif [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION @@ -1044,6 +1402,17 @@ static int fallbacks[MIGRATE_TYPES][4] = { #endif }; +#ifdef CONFIG_CMA +static struct page *__rmqueue_cma_fallback(struct zone *zone, + unsigned int order) +{ + return __rmqueue_smallest(zone, order, MIGRATE_CMA); +} +#else +static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + unsigned int order) { return NULL; } +#endif + /* * Move the free pages in a range to the free lists of the requested type. * Note that start_page and end_pages are not aligned on a pageblock @@ -1136,14 +1505,40 @@ static void change_pageblock_range(struct page *pageblock_page, * as fragmentation caused by those allocations polluting movable pageblocks * is worse than movable allocations stealing from unmovable and reclaimable * pageblocks. - * - * If we claim more than half of the pageblock, change pageblock's migratetype - * as well. */ -static void try_to_steal_freepages(struct zone *zone, struct page *page, - int start_type, int fallback_type) +static bool can_steal_fallback(unsigned int order, int start_mt) +{ + /* + * Leaving this order check is intended, although there is + * relaxed order check in next check. The reason is that + * we can actually steal whole pageblock if this condition met, + * but, below check doesn't guarantee it and that is just heuristic + * so could be changed anytime. 
+ */ + if (order >= pageblock_order) + return true; + + if (order >= pageblock_order / 2 || + start_mt == MIGRATE_RECLAIMABLE || + start_mt == MIGRATE_UNMOVABLE || + page_group_by_mobility_disabled) + return true; + + return false; +} + +/* + * This function implements actual steal behaviour. If order is large enough, + * we can steal whole pageblock. If not, we first move freepages in this + * pageblock and check whether half of pages are moved or not. If half of + * pages are moved, we can change migratetype of pageblock and permanently + * use it's pages as requested migratetype in the future. + */ +static void steal_suitable_fallback(struct zone *zone, struct page *page, + int start_type) { int current_order = page_order(page); + int pages; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -1151,19 +1546,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page, return; } - if (current_order >= pageblock_order / 2 || - start_type == MIGRATE_RECLAIMABLE || - start_type == MIGRATE_UNMOVABLE || - page_group_by_mobility_disabled) { - int pages; + pages = move_freepages_block(zone, page, start_type); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) + set_pageblock_migratetype(page, start_type); +} + +/* + * Check whether there is a suitable fallback freepage with requested order. + * If only_stealable is true, this function returns fallback_mt only if + * we can steal other freepages all together. This would help to reduce + * fragmentation due to mixed migratetype pages in one pageblock. 
+ */ +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal) +{ + int i; + int fallback_mt; + + if (area->nr_free == 0) + return -1; + + *can_steal = false; + for (i = 0;; i++) { + fallback_mt = fallbacks[migratetype][i]; + if (fallback_mt == MIGRATE_RESERVE) + break; + + if (list_empty(&area->free_list[fallback_mt])) + continue; + + if (can_steal_fallback(order, migratetype)) + *can_steal = true; - pages = move_freepages_block(zone, page, start_type); + if (!only_stealable) + return fallback_mt; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, start_type); + if (*can_steal) + return fallback_mt; } + + return -1; } /* Remove an element from the buddy allocator from the fallback list */ @@ -1173,64 +1598,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct free_area *area; unsigned int current_order; struct page *page; + int fallback_mt; + bool can_steal; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order && current_order <= MAX_ORDER-1; --current_order) { - int i; - for (i = 0;; i++) { - int migratetype = fallbacks[start_migratetype][i]; - int buddy_type = start_migratetype; - - /* MIGRATE_RESERVE handled later if necessary */ - if (migratetype == MIGRATE_RESERVE) - break; - - area = &(zone->free_area[current_order]); - if (list_empty(&area->free_list[migratetype])) - continue; - - page = list_entry(area->free_list[migratetype].next, - struct page, lru); - area->nr_free--; - - if (!is_migrate_cma(migratetype)) { - try_to_steal_freepages(zone, page, - start_migratetype, - migratetype); - } else { - /* - * When borrowing from MIGRATE_CMA, we need to - * release the excess buddy pages to CMA - * itself, and we do not try to steal extra - * free pages. 
- */ - buddy_type = migratetype; - } + area = &(zone->free_area[current_order]); + fallback_mt = find_suitable_fallback(area, current_order, + start_migratetype, false, &can_steal); + if (fallback_mt == -1) + continue; - /* Remove the page from the freelists */ - list_del(&page->lru); - rmv_page_order(page); + page = list_entry(area->free_list[fallback_mt].next, + struct page, lru); + if (can_steal) + steal_suitable_fallback(zone, page, start_migratetype); - expand(zone, page, order, current_order, area, - buddy_type); + /* Remove the page from the freelists */ + area->nr_free--; + list_del(&page->lru); + rmv_page_order(page); - /* - * The freepage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. - */ - set_freepage_migratetype(page, buddy_type); + expand(zone, page, order, current_order, area, + start_migratetype); + /* + * The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages(). This is OK as long as it + * does not differ for MIGRATE_CMA pageblocks. For CMA + * we need to make sure unallocated pages flushed from + * pcp lists are returned to the correct freelist. 
+ */ + set_freepage_migratetype(page, start_migratetype); - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); - return page; - } + return page; } return NULL; @@ -1249,7 +1655,11 @@ retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { - page = __rmqueue_fallback(zone, order, migratetype); + if (migratetype == MIGRATE_MOVABLE) + page = __rmqueue_cma_fallback(zone, order); + + if (!page) + page = __rmqueue_fallback(zone, order, migratetype); /* * Use MIGRATE_RESERVE rather than fail an allocation. goto @@ -1321,7 +1731,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) int to_drain, batch; local_irq_save(flags); - batch = ACCESS_ONCE(pcp->batch); + batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); @@ -1520,7 +1930,7 @@ void free_hot_cold_page(struct page *page, bool cold) list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - unsigned long batch = ACCESS_ONCE(pcp->batch); + unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); pcp->count -= batch; } @@ -1553,6 +1963,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; + gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -1566,10 +1977,11 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - set_page_owner(page, 0, 0); + gfp_mask = get_page_owner_gfp(page); + set_page_owner(page, 0, gfp_mask); for (i = 1; i < (1 << order); i++) { set_page_refcounted(page + i); - set_page_owner(page + i, 0, 0); + set_page_owner(page + i, 0, gfp_mask); } } 
EXPORT_SYMBOL_GPL(split_page); @@ -1599,6 +2011,8 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); + set_page_owner(page, order, __GFP_MOVABLE); + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; @@ -1610,7 +2024,7 @@ int __isolate_free_page(struct page *page, unsigned int order) } } - set_page_owner(page, order, 0); + return 1UL << order; } @@ -2272,48 +2686,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) show_mem(filter); } -static inline int -should_alloc_retry(gfp_t gfp_mask, unsigned int order, - unsigned long did_some_progress, - unsigned long pages_reclaimed) -{ - /* Do not loop if specifically requested */ - if (gfp_mask & __GFP_NORETRY) - return 0; - - /* Always retry if specifically requested */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - - /* - * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim - * making forward progress without invoking OOM. Suspend also disables - * storage devices so kswapd will not help. Bail if we are suspending. - */ - if (!did_some_progress && pm_suspended_storage()) - return 0; - - /* - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - */ - if (order <= PAGE_ALLOC_COSTLY_ORDER) - return 1; - - /* - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. 
- */ - if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) - return 1; - - return 0; -} - static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -2323,10 +2695,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; /* - * Acquire the per-zone oom lock for each zone. If that - * fails, somebody else is making progress for us. + * Acquire the oom lock. If that fails, somebody else is + * making progress for us. */ - if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { + if (!mutex_trylock(&oom_lock)) { *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; @@ -2352,23 +2724,19 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for light reclaim */ + /* The OOM killer does not compensate for IO-less reclaim */ if (!(gfp_mask & __GFP_FS)) { /* * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but - * keep looping as per should_alloc_retry(). + * keep looping as per tradition. */ *did_some_progress = 1; goto out; } - /* - * GFP_THISNODE contains __GFP_NORETRY and we never hit this. - * Sanity check for bare calls of __GFP_THISNODE, not real OOM. - * The caller should handle page allocation failure by itself if - * it specifies __GFP_THISNODE. - * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 
- */ + if (pm_suspended_storage()) + goto out; + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; } @@ -2377,7 +2745,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: - oom_zonelist_unlock(ac->zonelist, gfp_mask); + mutex_unlock(&oom_lock); return page; } @@ -2623,15 +2991,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* - * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and - * __GFP_NOWARN set) should not cause reclaim since the subsystem - * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim - * using a larger set of nodes after it has established that the - * allowed per node queues are empty and that nodes are - * over allocated. + * If this allocation cannot block and it is for a specific node, then + * fail early. There's no need to wakeup kswapd or retry for a + * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) goto nopage; retry: @@ -2754,40 +3118,40 @@ retry: if (page) goto got_pg; - /* Check if we should retry the allocation */ + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + goto noretry; + + /* Keep reclaiming pages as long as there is reasonable progress */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, did_some_progress, - pages_reclaimed)) { - /* - * If we fail to make progress by freeing individual - * pages, but the allocation wants us to keep going, - * start OOM killing tasks. 
- */ - if (!did_some_progress) { - page = __alloc_pages_may_oom(gfp_mask, order, ac, - &did_some_progress); - if (page) - goto got_pg; - if (!did_some_progress) - goto nopage; - } + if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || + ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); goto retry; - } else { - /* - * High-order allocations do not necessarily loop after - * direct reclaim and reclaim/compaction depends on compaction - * being called after reclaim so call directly if necessary - */ - page = __alloc_pages_direct_compact(gfp_mask, order, - alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); - if (page) - goto got_pg; } + /* Reclaim has failed us, start killing things */ + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); + if (page) + goto got_pg; + + /* Retry as long as the OOM killer is making progress */ + if (did_some_progress) + goto retry; + +noretry: + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, + ac, migration_mode, + &contended_compaction, + &deferred_compaction); + if (page) + goto got_pg; nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: @@ -2824,7 +3188,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, /* * Check the zones suitable for the gfp_mask contain at least one * valid zone. 
It's possible to have an empty zonelist as a result - * of GFP_THISNODE and a memoryless node + * of __GFP_THISNODE and a memoryless node */ if (unlikely(!zonelist->_zonerefs->zone)) return NULL; @@ -2927,6 +3291,104 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* + * Page Fragment: + * An arbitrary-length arbitrary-offset area of memory which resides + * within a 0 or higher order page. Multiple fragments within that page + * are individually refcounted, in the page's reference counter. + * + * The page_frag functions below provide a simple allocation framework for + * page fragments. This is used by the network stack and network device + * drivers to provide a backing region of memory for use as either an + * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. + */ +static struct page *__page_frag_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) +{ + struct page *page = NULL; + gfp_t gfp = gfp_mask; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, + PAGE_FRAG_CACHE_MAX_ORDER); + nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; +#endif + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->va = page ? page_address(page) : NULL; + + return page; +} + +void *__alloc_page_frag(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) +{ + unsigned int size = PAGE_SIZE; + struct page *page; + int offset; + + if (unlikely(!nc->va)) { +refill: + page = __page_frag_refill(nc, gfp_mask); + if (!page) + return NULL; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* Even if we own the page, we do not use atomic_set(). + * This would break get_page_unless_zero() users. 
+ */ + atomic_add(size - 1, &page->_count); + + /* reset page count bias and offset to start of new frag */ + nc->pfmemalloc = page->pfmemalloc; + nc->pagecnt_bias = size; + nc->offset = size; + } + + offset = nc->offset - fragsz; + if (unlikely(offset < 0)) { + page = virt_to_page(nc->va); + + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + goto refill; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* OK, page count is 0, we can safely set it */ + atomic_set(&page->_count, size); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + offset = size - fragsz; + } + + nc->pagecnt_bias--; + nc->offset = offset; + + return nc->va + offset; +} +EXPORT_SYMBOL(__alloc_page_frag); + +/* + * Frees a page fragment allocated out of either a compound or order 0 page. + */ +void __free_page_frag(void *addr) +{ + struct page *page = virt_to_head_page(addr); + + if (unlikely(put_page_testzero(page))) + __free_pages_ok(page, compound_order(page)); +} +EXPORT_SYMBOL(__free_page_frag); + +/* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter * of the current memory cgroup. * @@ -3201,38 +3663,31 @@ static void show_migration_types(unsigned char type) * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. - * Suppresses nodes that are not allowed by current's cpuset if - * SHOW_MEM_FILTER_NODES is passed. + * + * Bits in @filter: + * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's + * cpuset. 
*/ void show_free_areas(unsigned int filter) { + unsigned long free_pcp = 0; int cpu; struct zone *zone; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; - show_node(zone); - printk("%s per-cpu:\n", zone->name); - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pageset; - - pageset = per_cpu_ptr(zone->pageset, cpu); - printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", - cpu, pageset->pcp.high, - pageset->pcp.batch, pageset->pcp.count); - } + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" - " free_cma:%lu\n", + " free:%lu free_pcp:%lu free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -3243,13 +3698,14 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_PAGES), + free_pcp, global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { @@ -3257,6 +3713,11 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; + + free_pcp = 0; + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + show_node(zone); printk("%s" " free:%lukB" @@ -3283,6 +3744,8 @@ void 
show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_pcp:%lukB" + " local_pcp:%ukB" " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" @@ -3314,6 +3777,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(free_pcp), + K(this_cpu_read(zone->pageset->pcp.count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), K(zone_page_state(zone, NR_PAGES_SCANNED)), @@ -4062,6 +4527,9 @@ static void setup_zone_migrate_reserve(struct zone *zone) zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) + return; + if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); @@ -4124,15 +4592,16 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - struct page *page; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; + unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &NODE_DATA(nid)->node_zones[zone]; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -4144,14 +4613,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; + if (!update_defer_init(pgdat, pfn, end_pfn, + &nr_initialised)) + break; } - page = pfn_to_page(pfn); - set_page_links(page, zone, nid, pfn); - mminit_verify_page_links(page, zone, nid, pfn); - init_page_count(page); - page_mapcount_reset(page); - page_cpupid_reset_last(page); - SetPageReserved(page); + /* * Mark the 
block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -4166,17 +4632,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. */ - if ((z->zone_start_pfn <= pfn) - && (pfn < zone_end_pfn(z)) - && !(pfn & (pageblock_nr_pages - 1))) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (!(pfn & (pageblock_nr_pages - 1))) { + struct page *page = pfn_to_page(pfn); - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. */ - if (!is_highmem_idx(zone)) - set_page_address(page, __va(pfn << PAGE_SHIFT)); -#endif + __init_single_page(page, pfn, zone, nid); + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } else { + __init_single_pfn(pfn, zone, nid); + } } } @@ -4434,57 +4897,30 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; int nid; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static unsigned long __meminitdata last_start_pfn, last_end_pfn; - static int __meminitdata last_nid; - if (last_start_pfn <= pfn && pfn < last_end_pfn) - return last_nid; + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -int __meminit early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; -} - -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0 && nid != node) - return false; - return true; -} -#endif - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
@@ -4726,22 +5162,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long realtotalpages, totalpages = 0; + unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - zones_size); - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= - zone_absent_pages_in_node(pgdat->node_id, i, + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long size, real_size; + + size = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zones_size); + real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + zone->spanned_pages = size; + zone->present_pages = real_size; + + totalpages += size; + realtotalpages += real_size; + } + + pgdat->node_spanned_pages = totalpages; pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4851,8 +5293,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. 
*/ static void __paginginit free_area_init_core(struct pglist_data *pgdat, - unsigned long node_start_pfn, unsigned long node_end_pfn, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, unsigned long node_end_pfn) { enum zone_type j; int nid = pgdat->node_id; @@ -4873,12 +5314,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, node_start_pfn, - node_end_pfn, zones_size); - realsize = freesize = size - zone_absent_pages_in_node(nid, j, - node_start_pfn, - node_end_pfn, - zholes_size); + size = zone->spanned_pages; + realsize = freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory @@ -4913,8 +5350,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; - zone->spanned_pages = size; - zone->present_pages = realsize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. @@ -5003,6 +5438,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5020,8 +5456,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5717,7 +6152,7 @@ static void __setup_per_zone_wmarks(void) * value here. 
* * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) - * deltas controls asynch page reclaim, and so should + * deltas control asynch page reclaim, and so should * not be capped for highmem. */ unsigned long min_pages; @@ -5970,9 +6405,9 @@ out: return ret; } +#ifdef CONFIG_NUMA int hashdist = HASHDIST_DEFAULT; -#ifdef CONFIG_NUMA static int __init set_hashdist(char *str) { if (!str) @@ -6164,7 +6599,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, mask <<= (BITS_PER_LONG - bitidx - 1); flags <<= (BITS_PER_LONG - bitidx - 1); - word = ACCESS_ONCE(bitmap[word_bitidx]); + word = READ_ONCE(bitmap[word_bitidx]); for (;;) { old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); if (word == old_word) diff --git a/mm/page_io.c b/mm/page_io.c index e6045804c8d8..520baa4b04d7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,8 +20,8 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/frontswap.h> -#include <linux/aio.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -69,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -void end_swap_bio_read(struct bio *bio, int err) +static void end_swap_bio_read(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -274,13 +274,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); - kiocb.ki_nbytes = PAGE_SIZE; set_page_writeback(page); unlock_page(page); - ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE, - &kiocb, &from, - kiocb.ki_pos); + ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 
755a42c76eb4..303c908790ef 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); - if (!is_migrate_isolate_page(buddy)) { + if (pfn_valid_within(page_to_pfn(buddy)) && + !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); kernel_map_pages(page, (1 << order), 1); set_page_refcounted(page); diff --git a/mm/page_owner.c b/mm/page_owner.c index 0993f5f36b01..983c3a10fa07 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +gfp_t __get_page_owner_gfp(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + return page_ext->gfp_mask; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) @@ -310,4 +317,4 @@ static int __init pageowner_init(void) return 0; } -module_init(pageowner_init) +late_initcall(pageowner_init) diff --git a/mm/percpu.c b/mm/percpu.c index 73c97a5f4495..2dd74487a0af 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1030,7 +1030,7 @@ area_found: memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); - kmemleak_alloc_percpu(ptr, size); + kmemleak_alloc_percpu(ptr, size, gfp); return ptr; fail_unlock: @@ -1310,7 +1310,7 @@ bool is_kernel_percpu_address(unsigned long addr) * and, from the second one, the backing allocator (currently either vm or * km) provides translation. * - * The addr can be tranlated simply without checking if it falls into the + * The addr can be translated simply without checking if it falls into the * first chunk. 
But the current code reflects better how percpu allocator * actually works, and the verification can discover both bugs in percpu * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current @@ -1762,7 +1762,7 @@ early_param("percpu_alloc", percpu_alloc_setup); * and other parameters considering needed percpu size, allocation * atom size and distances between CPUs. * - * Groups are always mutliples of atom size and CPUs which are of + * Groups are always multiples of atom size and CPUs which are of * LOCAL_DISTANCE both ways are grouped together and share space for * units in the same group. The returned configuration is guaranteed * to have CPUs on different nodes on different groups and >=75% usage diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c25f94b33811..6b674e00153c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, } #endif -#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + VM_BUG_ON(!pmd_trans_huge(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } @@ -198,3 +199,23 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef pmdp_collapse_flush +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + /* + * pmd and hugepage pte format are same. So we could + * use the same function. 
+ */ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index b1597690530c..e88d071648c2 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -257,22 +257,18 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec *iov_r = iovstack_r; struct iov_iter iter; ssize_t rc; + int dir = vm_write ? WRITE : READ; if (flags != 0) return -EINVAL; /* Check iovecs */ - if (vm_write) - rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, - iovstack_l, &iov_l); - else - rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, - iovstack_l, &iov_l); - if (rc <= 0) + rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); + if (rc < 0) + return rc; + if (!iov_iter_count(&iter)) goto free_iovecs; - iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); - rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) @@ -283,8 +279,7 @@ static ssize_t process_vm_rw(pid_t pid, free_iovecs: if (iov_r != iovstack_r) kfree(iov_r); - if (iov_l != iovstack_l) - kfree(iov_l); + kfree(iov_l); return rc; } @@ -320,21 +315,16 @@ compat_process_vm_rw(compat_pid_t pid, struct iovec *iov_r = iovstack_r; struct iov_iter iter; ssize_t rc = -EFAULT; + int dir = vm_write ? WRITE : READ; if (flags != 0) return -EINVAL; - if (vm_write) - rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, - UIO_FASTIOV, iovstack_l, - &iov_l); - else - rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, - UIO_FASTIOV, iovstack_l, - &iov_l); - if (rc <= 0) + rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); + if (rc < 0) + return rc; + if (!iov_iter_count(&iter)) goto free_iovecs; - iov_iter_init(&iter, vm_write ? 
WRITE : READ, iov_l, liovcnt, rc); rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); @@ -346,8 +336,7 @@ compat_process_vm_rw(compat_pid_t pid, free_iovecs: if (iov_r != iovstack_r) kfree(iov_r); - if (iov_l != iovstack_l) - kfree(iov_l); + kfree(iov_l); return rc; } diff --git a/mm/readahead.c b/mm/readahead.c index 935675844b2e..60cd846a9a44 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (bdi_read_congested(inode_to_bdi(mapping->host))) + if (inode_read_congested(mapping->host)) return; /* do read-ahead */ diff --git a/mm/rmap.c b/mm/rmap.c index c161a14b6a8f..171b68768df1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -30,6 +30,8 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) + * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * mapping->tree_lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) @@ -456,7 +458,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(page->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!page_mapped(page)) @@ -500,14 +502,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(page->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!page_mapped(page)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - 
root_anon_vma = ACCESS_ONCE(anon_vma->root); + root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still @@ -625,7 +627,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pmd = pmd_offset(pud, address); /* - * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ @@ -950,7 +952,12 @@ void page_move_anon_rmap(struct page *page, VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; + /* + * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * simultaneously, so a concurrent reader (eg page_referenced()'s + * PageAnon()) will not see one without the other. 
+ */ + WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); } /** diff --git a/mm/shmem.c b/mm/shmem.c index cf2d0ca010bc..dbe0c1e8349c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,7 +31,7 @@ #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> -#include <linux/aio.h> +#include <linux/uio.h> static struct vfsmount *shm_mnt; @@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; @@ -569,7 +569,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } - if (newsize < oldsize) { + if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); shmem_truncate_range(inode, newsize, (loff_t)-1); @@ -2274,7 +2274,7 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int ret; /* @@ -2298,7 +2298,7 @@ out: static int shmem_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb); @@ -2315,7 +2315,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); drop_nlink(dir); return shmem_unlink(dir, dentry); } @@ -2336,8 +2336,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru } old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = - 
old_dentry->d_inode->i_ctime = - new_dentry->d_inode->i_ctime = CURRENT_TIME; + d_inode(old_dentry)->i_ctime = + d_inode(new_dentry)->i_ctime = CURRENT_TIME; return 0; } @@ -2376,7 +2376,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) */ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) @@ -2396,10 +2396,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc return error; } - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { - drop_nlink(new_dentry->d_inode); + drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { @@ -2451,6 +2451,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return -ENOMEM; } inode->i_op = &shmem_short_symlink_operations; + inode->i_link = info->symlink; } else { error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { @@ -2474,30 +2475,23 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); - return NULL; -} - -static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *shmem_follow_link(struct dentry *dentry, void **cookie) { struct page *page = NULL; - int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); - nd_set_link(nd, error ? 
ERR_PTR(error) : kmap(page)); - if (page) - unlock_page(page); - return page; + int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); + if (error) + return ERR_PTR(error); + unlock_page(page); + *cookie = page; + return kmap(page); } -static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void shmem_put_link(struct inode *unused, void *cookie) { - if (!IS_ERR(nd_get_link(nd))) { - struct page *page = cookie; - kunmap(page); - mark_page_accessed(page); - page_cache_release(page); - } + struct page *page = cookie; + kunmap(page); + mark_page_accessed(page); + page_cache_release(page); } #ifdef CONFIG_TMPFS_XATTR @@ -2574,7 +2568,7 @@ static int shmem_xattr_validate(const char *name) static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); int err; /* @@ -2595,7 +2589,7 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, static int shmem_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); int err; /* @@ -2615,7 +2609,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, static int shmem_removexattr(struct dentry *dentry, const char *name) { - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); int err; /* @@ -2635,14 +2629,14 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); return simple_xattr_list(&info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static 
const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_short_symlink, + .follow_link = simple_follow_link, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, @@ -3118,8 +3112,6 @@ static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = shmem_file_read_iter, .write_iter = generic_file_write_iter, .fsync = noop_fsync, @@ -3371,8 +3363,8 @@ put_path: * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a - * higher layer. The one user is the big_key implementation. LSM checks - * are provided at the key level rather than the inode level. + * higher layer. The users are the big_key and shm implementations. LSM + * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size @@ -3403,7 +3395,13 @@ int shmem_zero_setup(struct vm_area_struct *vma) struct file *file; loff_t size = vma->vm_end - vma->vm_start; - file = shmem_file_setup("dev/zero", size, vma->vm_flags); + /* + * Cloning a new file under mmap_sem leads to a lock ordering conflict + * between XFS directory reading and selinux: since this file is only + * accessible to the user through its mapping, use S_PRIVATE flag to + * bypass file security, in the same way as shmem_kernel_file_setup(). 
+ */ + file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/mm/slab.c b/mm/slab.c index c4b89eaf4c96..200e22412a16 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, return NULL; } +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return flags; +} + #else /* CONFIG_NUMA */ static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); @@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) return __cache_free_alien(cachep, objp, node, page_node); } + +/* + * Construct gfp mask to allocate from a specific node but do not invoke reclaim + * or warn about failures. + */ +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; +} #endif /* @@ -1440,6 +1454,7 @@ void __init kmem_cache_init(void) kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; + setup_kmalloc_cache_index_table(); slab_early_init = 0; @@ -2825,7 +2840,7 @@ alloc_done: if (unlikely(!ac->avail)) { int x; force_grow: - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. 
*/ ac = cpu_cache_get(cachep); @@ -3019,7 +3034,7 @@ retry: get_node(cache, nid) && get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (obj) break; } @@ -3047,7 +3062,7 @@ retry: nid = page_to_nid(page); if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (!obj) /* * Another processor may allocate the @@ -3118,7 +3133,7 @@ retry: must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); if (x) goto retry; diff --git a/mm/slab.h b/mm/slab.h index 4c3ac12dd644..8da63e4e470f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags, #ifndef CONFIG_SLOB /* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(unsigned long); /* Find the kmalloc slab corresponding for a certain size */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 999bb3424d44..86831105a09f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) /* * Merge control. If this is set then no merging of slab caches will occur. @@ -784,25 +783,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) } /* - * Create the kmalloc array. Some of the regular kmalloc arrays - * may already have been created because they were needed to - * enable allocations for slab creation. + * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. 
+ * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is + * kmalloc-67108864. */ -void __init create_kmalloc_caches(unsigned long flags) +static struct { + const char *name; + unsigned long size; +} const kmalloc_info[] __initconst = { + {NULL, 0}, {"kmalloc-96", 96}, + {"kmalloc-192", 192}, {"kmalloc-8", 8}, + {"kmalloc-16", 16}, {"kmalloc-32", 32}, + {"kmalloc-64", 64}, {"kmalloc-128", 128}, + {"kmalloc-256", 256}, {"kmalloc-512", 512}, + {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, + {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, + {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, + {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, + {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, + {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, + {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, + {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, + {"kmalloc-67108864", 67108864} +}; + +/* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ +void __init setup_kmalloc_cache_index_table(void) { int i; - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. 
- * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); @@ -833,11 +852,26 @@ void __init create_kmalloc_caches(unsigned long flags) for (i = 128 + 8; i <= 192; i += 8) size_index[size_index_elem(i)] = 8; } +} + +static void __init new_kmalloc_cache(int idx, unsigned long flags) +{ + kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + kmalloc_info[idx].size, flags); +} + +/* + * Create the kmalloc array. Some of the regular kmalloc arrays + * may already have been created because they were needed to + * enable allocations for slab creation. + */ +void __init create_kmalloc_caches(unsigned long flags) +{ + int i; + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) { - kmalloc_caches[i] = create_kmalloc_cache(NULL, - 1 << i, flags); - } + if (!kmalloc_caches[i]) + new_kmalloc_cache(i, flags); /* * Caches that are not of the two-to-the-power-of size. 
@@ -845,27 +879,14 @@ void __init create_kmalloc_caches(unsigned long flags) * earlier power of two caches */ if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); - + new_kmalloc_cache(1, flags); if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); + new_kmalloc_cache(2, flags); } /* Kmalloc array is now usable */ slab_state = UP; - for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - char *n; - - if (s) { - n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); - - BUG_ON(!n); - s->name = n; - } - } - #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { struct kmem_cache *s = kmalloc_caches[i]; diff --git a/mm/slob.c b/mm/slob.c index 94a7fede6d48..4765f65019c7 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) return 0; } -void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) +static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; @@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; } -EXPORT_SYMBOL(slob_alloc_node); void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { diff --git a/mm/slub.c b/mm/slub.c index 82c473780c91..816df0016555 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) - return 1; + return true; } else #endif { @@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page page->freelist = freelist_new; set_page_slub_counters(page, counters_new); slab_unlock(page); - return 1; + return true; } 
slab_unlock(page); } @@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, @@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) - return 1; + return true; } else #endif { @@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, set_page_slub_counters(page, counters_new); slab_unlock(page); local_irq_restore(flags); - return 1; + return true; } slab_unlock(page); local_irq_restore(flags); @@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } #ifdef CONFIG_SLUB_DEBUG @@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; - if (tolower(*str) == 'o') { - /* - * Avoid enabling debugging on caches if its minimum order - * would increase as a result. - */ - disable_higher_order_debug = 1; - goto out; - } - slub_debug = 0; if (*str == '-') /* @@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str) case 'a': slub_debug |= SLAB_FAILSLAB; break; + case 'o': + /* + * Avoid enabling debugging on caches if its minimum + * order would increase as a result. + */ + disable_higher_order_debug = 1; + break; default: pr_err("slub_debug option '%c' unknown. 
skipped\n", *str); @@ -3702,6 +3700,7 @@ void __init kmem_cache_init(void) kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ + setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); #ifdef CONFIG_SMP @@ -4279,7 +4278,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int node; struct page *page; - page = ACCESS_ONCE(c->page); + page = READ_ONCE(c->page); if (!page) continue; @@ -4294,7 +4293,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; - page = ACCESS_ONCE(c->partial); + page = READ_ONCE(c->partial); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) diff --git a/mm/swap.c b/mm/swap.c index cd3a5e64cea9..a3a0a2f1f7c3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,6 +31,7 @@ #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> +#include <linux/hugetlb.h> #include "internal.h" @@ -42,7 +43,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page) { compound_page_dtor *dtor; - __page_cache_release(page); + /* + * __page_cache_release() is supposed to be called for thp, not for + * hugetlb. This is because hugetlb page does never have PageLRU set + * (it's never listed to any LRU lists) and no memcg routines should + * be called for hugetlb (it has a separate hugetlb_cgroup.) + */ + if (!PageHuge(page)) + __page_cache_release(page); dtor = get_compound_page_dtor(page); (*dtor)(page); } @@ -123,7 +131,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page) * here, see the comment above this function. 
*/ VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); if (put_page_testzero(page_head)) { /* * If this is the tail of a slab THP page, @@ -743,7 +750,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, void *arg) { int lru, file; @@ -811,36 +818,36 @@ void lru_add_drain_cpu(int cpu) local_irq_restore(flags); } - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); activate_page_drain(cpu); } /** - * deactivate_page - forcefully deactivate a page + * deactivate_file_page - forcefully deactivate a file page * @page: page to deactivate * * This function hints the VM that @page is a good reclaim candidate, * for example if its invalidation fails due to the page being dirty * or under writeback. */ -void deactivate_page(struct page *page) +void deactivate_file_page(struct page *page) { /* - * In a workload with many unevictable page such as mprotect, unevictable - * page deactivation for accelerating reclaim is pointless. + * In a workload with many unevictable page such as mprotect, + * unevictable page deactivation for accelerating reclaim is pointless. 
*/ if (PageUnevictable(page)) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); if (!pagevec_add(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + put_cpu_var(lru_deactivate_file_pvecs); } } @@ -872,7 +879,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); diff --git a/mm/swap_state.c b/mm/swap_state.c index 405923f77334..8bc8e66138da 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) unsigned int pages, max_pages, last_ra; static atomic_t last_readahead_pages; - max_pages = 1 << ACCESS_ONCE(page_cluster); + max_pages = 1 << READ_ONCE(page_cluster); if (max_pages <= 1) return 1; diff --git a/mm/swapfile.c b/mm/swapfile.c index 63f55ccb9b26..41e4581af7c5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, else continue; } - count = ACCESS_ONCE(si->swap_map[i]); + count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; } @@ -2032,7 +2032,7 @@ static int swap_show(struct seq_file *swap, void *v) } file = si->swap_file; - len = seq_path(swap, &file->f_path, " \t\n\\"); + len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? 
diff --git a/mm/truncate.c b/mm/truncate.c index ddec5a5966d7..76e35ad97102 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset, } /* - * This cancels just the dirty bit on the kernel page itself, it - * does NOT actually remove dirty bits on any mmap's that may be - * around. It also leaves the page tagged dirty, so any sync - * activity will still find it on the dirty lists, and in particular, - * clear_page_dirty_for_io() will still look at the dirty bits in - * the VM. - * - * Doing this should *normally* only ever be done when a page - * is truncated, and is not actually mapped anywhere at all. However, - * fs/buffer.c does this when it notices that somebody has cleaned - * out all the buffers on a page without actually doing it through - * the VM. Can you say "ext3 is horribly ugly"? Tought you could. - */ -void cancel_dirty_page(struct page *page, unsigned int account_size) -{ - if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; - if (mapping && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - if (account_size) - task_io_account_cancelled_write(account_size); - } - } -} -EXPORT_SYMBOL(cancel_dirty_page); - -/* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). @@ -140,8 +111,12 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_CACHE_SIZE); - + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * Hence dirty accounting check is placed after invalidation. 
+ */ + cancel_dirty_page(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -513,7 +488,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, * of interest and try to speed up its reclaim. */ if (!ret) - deactivate_page(page); + deactivate_file_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); @@ -535,19 +510,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg; + unsigned long flags; + if (page->mapping != mapping) return 0; if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); - spin_unlock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, NULL, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -555,7 +535,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) page_cache_release(page); /* pagecache ref */ return 1; failed: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } diff --git a/mm/util.c b/mm/util.c index 3981ae9d1b15..68ff8a5361e7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -325,9 +325,37 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +static inline void *__page_rmapping(struct page *page) +{ + unsigned long mapping; + + mapping = (unsigned long)page->mapping; + mapping &= ~PAGE_MAPPING_FLAGS; + + return (void *)mapping; +} + +/* Neutral page->mapping pointer to address_space or anon_vma or other */ +void *page_rmapping(struct page *page) +{ + page = compound_head(page); + return 
__page_rmapping(page); +} + +struct anon_vma *page_anon_vma(struct page *page) +{ + unsigned long mapping; + + page = compound_head(page); + mapping = (unsigned long)page->mapping; + if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + return NULL; + return __page_rmapping(page); +} + struct address_space *page_mapping(struct page *page) { - struct address_space *mapping = page->mapping; + unsigned long mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page) swp_entry_t entry; entry.val = page_private(page); - mapping = swap_address_space(entry); - } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) - mapping = NULL; - return mapping; + return swap_address_space(entry); + } + + mapping = (unsigned long)page->mapping; + if (mapping & PAGE_MAPPING_FLAGS) + return NULL; + return page->mapping; } int overcommit_ratio_handler(struct ctl_table *table, int write, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 49abccf29a29..2faaa2976447 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -29,6 +29,7 @@ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/llist.h> +#include <linux/bitops.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> @@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_clear_huge(pmd)) + continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next); @@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_clear_huge(pud)) + continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next); @@ -760,7 +765,7 @@ struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; - DECLARE_BITMAP(dirty_map, 
VMAP_BBMAP_BITS); + unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; struct list_head purge; @@ -791,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr) return addr; } -static struct vmap_block *new_vmap_block(gfp_t gfp_mask) +static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) +{ + unsigned long addr; + + addr = va_start + (pages_off << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); + return (void *)addr; +} + +/** + * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this + * block. Of course pages number can't exceed VMAP_BBMAP_BITS + * @order: how many 2^order pages should be occupied in newly allocated block + * @gfp_mask: flags for the page level allocator + * + * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) + */ +static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; struct vmap_area *va; unsigned long vb_idx; int node, err; + void *vaddr; node = numa_node_id(); @@ -821,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) return ERR_PTR(err); } + vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; - vb->free = VMAP_BBMAP_BITS; + /* At least something should be left free */ + BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; - bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); + vb->dirty_min = VMAP_BBMAP_BITS; + vb->dirty_max = 0; INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); @@ -837,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vbq = &get_cpu_var(vmap_block_queue); spin_lock(&vbq->lock); - list_add_rcu(&vb->free_list, &vbq->free); + list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); put_cpu_var(vmap_block_queue); - return vb; + return vaddr; } static 
void free_vmap_block(struct vmap_block *vb) @@ -876,7 +903,8 @@ static void purge_fragmented_blocks(int cpu) if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { vb->free = 0; /* prevent further allocs after releasing lock */ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ - bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); + vb->dirty_min = 0; + vb->dirty_max = VMAP_BBMAP_BITS; spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); @@ -905,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; - unsigned long addr = 0; + void *vaddr = NULL; unsigned int order; BUG_ON(size & ~PAGE_MASK); @@ -920,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) } order = get_order(size); -again: rcu_read_lock(); vbq = &get_cpu_var(vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { - int i; + unsigned long pages_off; spin_lock(&vb->lock); - if (vb->free < 1UL << order) - goto next; + if (vb->free < (1UL << order)) { + spin_unlock(&vb->lock); + continue; + } - i = VMAP_BBMAP_BITS - vb->free; - addr = vb->va->va_start + (i << PAGE_SHIFT); - BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); + pages_off = VMAP_BBMAP_BITS - vb->free; + vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } + spin_unlock(&vb->lock); break; -next: - spin_unlock(&vb->lock); } put_cpu_var(vmap_block_queue); rcu_read_unlock(); - if (!addr) { - vb = new_vmap_block(gfp_mask); - if (IS_ERR(vb)) - return vb; - goto again; - } + /* Allocate new block if nothing was found */ + if (!vaddr) + vaddr = new_vmap_block(order, gfp_mask); - return (void *)addr; + return vaddr; } static void vb_free(const void *addr, unsigned long size) @@ -974,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size) 
order = get_order(size); offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); + offset >>= PAGE_SHIFT; vb_idx = addr_to_vb_idx((unsigned long)addr); rcu_read_lock(); @@ -984,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); spin_lock(&vb->lock); - BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); + + /* Expand dirty range */ + vb->dirty_min = min(vb->dirty_min, offset); + vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { @@ -1023,25 +1050,18 @@ void vm_unmap_aliases(void) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { - int i, j; - spin_lock(&vb->lock); - i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); - if (i < VMAP_BBMAP_BITS) { + if (vb->dirty) { + unsigned long va_start = vb->va->va_start; unsigned long s, e; - j = find_last_bit(vb->dirty_map, - VMAP_BBMAP_BITS); - j = j + 1; /* need exclusive index */ + s = va_start + (vb->dirty_min << PAGE_SHIFT); + e = va_start + (vb->dirty_max << PAGE_SHIFT); - s = vb->va->va_start + (i << PAGE_SHIFT); - e = vb->va->va_start + (j << PAGE_SHIFT); - flush = 1; + start = min(s, start); + end = max(e, end); - if (s < start) - start = s; - if (e > end) - end = e; + flush = 1; } spin_unlock(&vb->lock); } @@ -1314,7 +1334,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) - align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); + align = 1ul << clamp_t(int, fls_long(size), + PAGE_SHIFT, IOREMAP_MAX_ORDER); size = PAGE_ALIGN(size); if (unlikely(!size)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e8eadd71bac..8286938c70de 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } + +/** + * sane_reclaim - is the usual dirty throttling mechanism 
operational? + * @sc: scan_control in question + * + * The normal page dirty throttling mechanism in balance_dirty_pages() is + * completely broken with the legacy memcg and direct stalling in + * shrink_page_list() is used for throttling instead, which lacks all the + * niceties such as fairness, adaptive pausing, bandwidth proportional + * allocation and configurability. + * + * This function tests whether the vmscan currently in progress can assume + * that the normal dirty throttling mechanism is operational. + */ +static bool sane_reclaim(struct scan_control *sc) +{ + struct mem_cgroup *memcg = sc->target_mem_cgroup; + + if (!memcg) + return true; +#ifdef CONFIG_CGROUP_WRITEBACK + if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup)) + return true; +#endif + return false; +} #else static bool global_reclaim(struct scan_control *sc) { return true; } + +static bool sane_reclaim(struct scan_control *sc) +{ + return true; +} #endif static unsigned long zone_reclaimable_pages(struct zone *zone) @@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi, - struct scan_control *sc) +static int may_write_to_inode(struct inode *inode, struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(inode)) return 1; - if (bdi == current->backing_dev_info) + if (inode_to_bdi(inode) == current->backing_dev_info) return 1; return 0; } @@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) + if (!may_write_to_inode(mapping->host, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, static int 
__remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { + unsigned long flags; + struct mem_cgroup *memcg; + BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. * @@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow); - spin_unlock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, shadow, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } @@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(inode_to_bdi(mapping->host))) || + inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) nr_congested++; @@ -935,24 +972,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, * note that the LRU is being scanned too quickly and the * caller can stall after 
page list has been processed. * - * 2) Global reclaim encounters a page, memcg encounters a - * page that is not marked for immediate reclaim or - * the caller does not have __GFP_IO. In this case mark - * the page for immediate reclaim and continue scanning. + * 2) Global or new memcg reclaim encounters a page that is + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the page for immediate + * reclaim and continue scanning. * - * __GFP_IO is checked because a loop driver thread might + * Require may_enter_fs because we would wait on fs, which + * may not have submitted IO yet. And the loop driver might * enter reclaim, and deadlock if it waits on a page for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * Don't require __GFP_FS, since we're not going into the - * FS, just waiting on its writeback completion. Worryingly, - * ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing - * may_enter_fs here is liable to OOM on them. - * - * 3) memcg encounters a page that is not already marked + * 3) Legacy memcg encounters a page that is not already marked * PageReclaim. 
memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to @@ -967,8 +1000,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Case 2 above */ - } else if (global_reclaim(sc) || - !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + } else if (sane_reclaim(sc) || + !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() * might have just cleared PageReclaim, then @@ -1416,7 +1449,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!global_reclaim(sc)) + if (!sane_reclaim(sc)) return 0; if (file) { @@ -1608,10 +1641,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, set_bit(ZONE_WRITEBACK, &zone->flags); /* - * memcg will stall in page writeback so only consider forcibly - * stalling for global reclaim + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling here. */ - if (global_reclaim(sc)) { + if (sane_reclaim(sc)) { /* * Tag a zone as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. @@ -2646,7 +2679,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone)) + if (!populated_zone(zone) || + zone_reclaimable_pages(zone) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -3596,7 +3630,7 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for ZONE_RECLAIM. 
This determines the fraction of pages @@ -3638,12 +3672,12 @@ static long zone_pagecache_reclaimable(struct zone *zone) long delta = 0; /* - * If RECLAIM_SWAP is set, then all file pages are considered + * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and zone_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_SWAP) + if (zone_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); else nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); @@ -3674,15 +3708,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .order = order, .priority = ZONE_RECLAIM_PRIORITY, .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, }; cond_resched(); /* - * We need to be able to allocate from the reserves for RECLAIM_SWAP + * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_SWAP. + * and RECLAIM_UNMAP. 
*/ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); diff --git a/mm/zbud.c b/mm/zbud.c index 2ee4e4520493..f3bf6f7627d8 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -97,6 +97,10 @@ struct zbud_pool { struct list_head lru; u64 pages_nr; struct zbud_ops *ops; +#ifdef CONFIG_ZPOOL + struct zpool *zpool; + struct zpool_ops *zpool_ops; +#endif }; /* @@ -123,7 +127,10 @@ struct zbud_header { static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) { - return zpool_evict(pool, handle); + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; } static struct zbud_ops zbud_zpool_ops = { @@ -131,9 +138,17 @@ static struct zbud_ops zbud_zpool_ops = { }; static void *zbud_zpool_create(char *name, gfp_t gfp, - struct zpool_ops *zpool_ops) + struct zpool_ops *zpool_ops, + struct zpool *zpool) { - return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + struct zbud_pool *pool; + + pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; } static void zbud_zpool_destroy(void *pool) @@ -292,7 +307,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) struct zbud_pool *pool; int i; - pool = kmalloc(sizeof(struct zbud_pool), gfp); + pool = kzalloc(sizeof(struct zbud_pool), gfp); if (!pool) return NULL; spin_lock_init(&pool->lock); diff --git a/mm/zpool.c b/mm/zpool.c index bacdab6e47de..722a4f60e90b 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -73,33 +73,6 @@ int zpool_unregister_driver(struct zpool_driver *driver) } EXPORT_SYMBOL(zpool_unregister_driver); -/** - * zpool_evict() - evict callback from a zpool implementation. - * @pool: pool to evict from. - * @handle: handle to evict. - * - * This can be used by zpool implementations to call the - * user's evict zpool_ops struct evict callback. 
- */ -int zpool_evict(void *pool, unsigned long handle) -{ - struct zpool *zpool; - - spin_lock(&pools_lock); - list_for_each_entry(zpool, &pools_head, list) { - if (zpool->pool == pool) { - spin_unlock(&pools_lock); - if (!zpool->ops || !zpool->ops->evict) - return -EINVAL; - return zpool->ops->evict(zpool, handle); - } - } - spin_unlock(&pools_lock); - - return -ENOENT; -} -EXPORT_SYMBOL(zpool_evict); - static struct zpool_driver *zpool_get_driver(char *type) { struct zpool_driver *driver; @@ -147,7 +120,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, struct zpool_driver *driver; struct zpool *zpool; - pr_info("creating pool type %s\n", type); + pr_debug("creating pool type %s\n", type); driver = zpool_get_driver(type); @@ -170,7 +143,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, zpool->type = driver->type; zpool->driver = driver; - zpool->pool = driver->create(name, gfp, ops); + zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; if (!zpool->pool) { @@ -180,7 +153,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - pr_info("created %s pool\n", type); + pr_debug("created pool type %s\n", type); spin_lock(&pools_lock); list_add(&zpool->list, &pools_head); @@ -202,7 +175,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, */ void zpool_destroy_pool(struct zpool *zpool) { - pr_info("destroying pool type %s\n", zpool->type); + pr_debug("destroying pool type %s\n", zpool->type); spin_lock(&pools_lock); list_del(&zpool->list); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0dec1fa5f656..0a7f81aa2249 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -12,35 +12,6 @@ */ /* - * This allocator is designed for use with zram. Thus, the allocator is - * supposed to work well under low memory conditions. In particular, it - * never attempts higher order page allocation which is very likely to - * fail under memory pressure. 
On the other hand, if we just use single - * (0-order) pages, it would suffer from very high fragmentation -- - * any object of size PAGE_SIZE/2 or larger would occupy an entire page. - * This was one of the major issues with its predecessor (xvmalloc). - * - * To overcome these issues, zsmalloc allocates a bunch of 0-order pages - * and links them together using various 'struct page' fields. These linked - * pages act as a single higher-order page i.e. an object can span 0-order - * page boundaries. The code refers to these linked pages as a single entity - * called zspage. - * - * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE - * since this satisfies the requirements of all its current users (in the - * worst case, page is incompressible and is thus stored "as-is" i.e. in - * uncompressed form). For allocation requests larger than this size, failure - * is returned (see zs_malloc). - * - * Additionally, zs_malloc() does not return a dereferenceable pointer. - * Instead, it returns an opaque handle (unsigned long) which encodes actual - * location of the allocated object. The reason for this indirection is that - * zsmalloc does not keep zspages permanently mapped since that would cause - * issues on 32-bit systems where the VA region for kernel space mappings - * is very small. So, before using the allocating memory, the object has to - * be mapped using zs_map_object() to get a usable pointer and subsequently - * unmapped using zs_unmap_object(). - * * Following is how we use various fields and flags of underlying * struct page(s) to form a zspage. * @@ -57,6 +28,8 @@ * * page->private (union with page->first_page): refers to the * component page after the first page + * If the page is first_page for huge object, it stores handle. + * Look at size_class->huge. * page->freelist: points to the first free object in zspage. * Free objects are linked together using in-place * metadata. 
@@ -72,12 +45,9 @@ * */ -#ifdef CONFIG_ZSMALLOC_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> @@ -110,6 +80,8 @@ #define ZS_MAX_ZSPAGE_ORDER 2 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) +#define ZS_HANDLE_SIZE (sizeof(unsigned long)) + /* * Object location (<PFN>, <obj_idx>) is encoded as * as single (unsigned long) handle value. @@ -133,13 +105,33 @@ #endif #endif #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) + +/* + * Memory for allocating for handle keeps object position by + * encoding <page, obj_idx> and the encoded value has a room + * in least bit(ie, look at obj_to_location). + * We use the bit to synchronize between object access by + * user and migration. + */ +#define HANDLE_PIN_BIT 0 + +/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because + * header keeps handle which is 4byte-aligned address so we + * have room for two bit at least. + */ +#define OBJ_ALLOCATED_TAG 1 +#define OBJ_TAG_BITS 1 +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) +/* each chunk includes extra space to keep handle */ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* @@ -172,6 +164,8 @@ enum fullness_group { enum zs_stat_type { OBJ_ALLOCATED, OBJ_USED, + CLASS_ALMOST_FULL, + CLASS_ALMOST_EMPTY, NR_ZS_STAT_TYPE, }; @@ -216,6 +210,8 @@ struct size_class { /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ + bool huge; #ifdef CONFIG_ZSMALLOC_STAT struct zs_size_stat stats; @@ -233,14 +229,24 @@ struct size_class { * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { - /* Handle of next free chunk (encodes <PFN, obj_idx>) */ - void *next; + union { + /* + * Position of next free chunk (encodes <PFN, obj_idx>) + * It's valid for non-allocated object + */ + void *next; + /* + * Handle of allocated object. + */ + unsigned long handle; + }; }; struct zs_pool { char *name; struct size_class **size_class; + struct kmem_cache *handle_cachep; gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; @@ -267,13 +273,44 @@ struct mapping_area { #endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ + bool huge; }; +static int create_handle_cache(struct zs_pool *pool) +{ + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + 0, 0, NULL); + return pool->handle_cachep ? 
0 : 1; +} + +static void destroy_handle_cache(struct zs_pool *pool) +{ + if (pool->handle_cachep) + kmem_cache_destroy(pool->handle_cachep); +} + +static unsigned long alloc_handle(struct zs_pool *pool) +{ + return (unsigned long)kmem_cache_alloc(pool->handle_cachep, + pool->flags & ~__GFP_HIGHMEM); +} + +static void free_handle(struct zs_pool *pool, unsigned long handle) +{ + kmem_cache_free(pool->handle_cachep, (void *)handle); +} + +static void record_obj(unsigned long handle, unsigned long obj) +{ + *(unsigned long *)handle = obj; +} + /* zpool driver */ #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops, + struct zpool *zpool) { return zs_create_pool(name, gfp); } @@ -346,6 +383,11 @@ static struct zpool_driver zs_zpool_driver = { MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -396,9 +438,182 @@ static int get_size_class_index(int size) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); - return idx; + return min(zs_size_classes - 1, idx); +} + +#ifdef CONFIG_ZSMALLOC_STAT + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] += cnt; +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] -= cnt; } +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return class->stats.objs[type]; +} + +static int __init zs_stat_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); + if 
(!zs_stat_root) + return -ENOMEM; + + return 0; +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long class_almost_full, class_almost_empty; + unsigned long obj_allocated, obj_used, pages_used; + unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + "class", "size", "almost_full", "almost_empty", + "obj_allocated", "obj_used", "pages_used", + "pages_per_zspage"); + + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + + if (class->index != i) + continue; + + spin_lock(&class->lock); + class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); + class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + spin_unlock(&class->lock); + + objs_per_zspage = get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + i, class->size, class_almost_full, class_almost_empty, + obj_allocated, obj_used, pages_used, + class->pages_per_zspage); + + total_class_almost_full += class_almost_full; + total_class_almost_empty += class_almost_empty; + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + "Total", "", total_class_almost_full, + total_class_almost_empty, total_objs, + total_used_objs, total_pages); + + return 0; +} + +static int zs_stats_size_open(struct inode *inode, struct file *file) +{ + return single_open(file, 
zs_stats_size_show, inode->i_private); +} + +static const struct file_operations zs_stat_size_ops = { + .open = zs_stats_size_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + struct dentry *entry; + + if (!zs_stat_root) + return -ENODEV; + + entry = debugfs_create_dir(name, zs_stat_root); + if (!entry) { + pr_warn("debugfs dir <%s> creation failed\n", name); + return -ENOMEM; + } + pool->stat_dentry = entry; + + entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, + pool->stat_dentry, pool, &zs_stat_size_ops); + if (!entry) { + pr_warn("%s: debugfs file entry <%s> creation failed\n", + name, "classes"); + return -ENOMEM; + } + + return 0; +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* CONFIG_ZSMALLOC_STAT */ + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return 0; +} + +static int __init zs_stat_init(void) +{ + return 0; +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + return 0; +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ +} + +#endif + + /* * For each size class, zspages are divided into different groups * depending on how "full" they are. 
This was done so that we could @@ -419,7 +634,7 @@ static enum fullness_group get_fullness_group(struct page *page) fg = ZS_EMPTY; else if (inuse == max_objects) fg = ZS_FULL; - else if (inuse <= max_objects / fullness_threshold_frac) + else if (inuse <= 3 * max_objects / fullness_threshold_frac) fg = ZS_ALMOST_EMPTY; else fg = ZS_ALMOST_FULL; @@ -448,6 +663,8 @@ static void insert_zspage(struct page *page, struct size_class *class, list_add_tail(&page->lru, &(*head)->lru); *head = page; + zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } /* @@ -473,6 +690,8 @@ static void remove_zspage(struct page *page, struct size_class *class, struct page, lru); list_del_init(&page->lru); + zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } /* @@ -484,11 +703,10 @@ static void remove_zspage(struct page *page, struct size_class *class, * page from the freelist of the old fullness group to that of the new * fullness group. */ -static enum fullness_group fix_fullness_group(struct zs_pool *pool, +static enum fullness_group fix_fullness_group(struct size_class *class, struct page *page) { int class_idx; - struct size_class *class; enum fullness_group currfg, newfg; BUG_ON(!is_first_page(page)); @@ -498,7 +716,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, if (newfg == currfg) goto out; - class = pool->size_class[class_idx]; remove_zspage(page, class, currfg); insert_zspage(page, class, newfg); set_zspage_mapping(page, class_idx, newfg); @@ -512,7 +729,8 @@ out: * to form a zspage for each size class. This is important * to reduce wastage due to unusable space left at end of * each zspage which is given as: - * wastage = Zp - Zp % size_class + * wastage = Zp % class_size + * usage = Zp - wastage * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 
* * For example, for size class of 3/8 * PAGE_SIZE, we should @@ -571,35 +789,50 @@ static struct page *get_next_page(struct page *page) /* * Encode <page, obj_idx> as a single handle value. - * On hardware platforms with physical memory starting at 0x0 the pfn - * could be 0 so we ensure that the handle will never be 0 by adjusting the - * encoded obj_idx value before encoding. + * We use the least bit of handle for tagging. */ -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) +static void *location_to_obj(struct page *page, unsigned long obj_idx) { - unsigned long handle; + unsigned long obj; if (!page) { BUG_ON(obj_idx); return NULL; } - handle = page_to_pfn(page) << OBJ_INDEX_BITS; - handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); + obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj <<= OBJ_TAG_BITS; - return (void *)handle; + return (void *)obj; } /* * Decode <page, obj_idx> pair from the given object handle. We adjust the * decoded obj_idx back to its original value since it was adjusted in - * obj_location_to_handle(). + * location_to_obj(). 
*/ -static void obj_handle_to_location(unsigned long handle, struct page **page, +static void obj_to_location(unsigned long obj, struct page **page, unsigned long *obj_idx) { - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); - *obj_idx = (handle & OBJ_INDEX_MASK) - 1; + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); +} + +static unsigned long handle_to_obj(unsigned long handle) +{ + return *(unsigned long *)handle; +} + +static unsigned long obj_to_head(struct size_class *class, struct page *page, + void *obj) +{ + if (class->huge) { + VM_BUG_ON(!is_first_page(page)); + return *(unsigned long *)page_private(page); + } else + return *(unsigned long *)obj; } static unsigned long obj_idx_to_offset(struct page *page, @@ -613,6 +846,25 @@ static unsigned long obj_idx_to_offset(struct page *page, return off + obj_idx * class_size; } +static inline int trypin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); +} + +static void pin_tag(unsigned long handle) +{ + while (!trypin_tag(handle)); +} + +static void unpin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + clear_bit_unlock(HANDLE_PIN_BIT, ptr); +} + static void reset_page(struct page *page) { clear_bit(PG_private, &page->flags); @@ -674,7 +926,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i++); + link->next = location_to_obj(page, i++); link += class->size / sizeof(*link); } @@ -684,7 +936,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) * page (if present) */ next_page = get_next_page(page); - link->next = obj_location_to_handle(next_page, 0); + link->next = location_to_obj(next_page, 0); kunmap_atomic(vaddr); page = next_page; off %= 
PAGE_SIZE; @@ -738,7 +990,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) init_zspage(first_page, class); - first_page->freelist = obj_location_to_handle(first_page, 0); + first_page->freelist = location_to_obj(first_page, 0); /* Maximum number of objects we can store in this zspage */ first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; @@ -860,12 +1112,19 @@ static void __zs_unmap_object(struct mapping_area *area, { int sizes[2]; void *addr; - char *buf = area->vm_buf; + char *buf; /* no write fastpath */ if (area->vm_mm == ZS_MM_RO) goto out; + buf = area->vm_buf; + if (!area->huge) { + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; + } + sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; @@ -952,11 +1211,6 @@ static void init_zs_size_classes(void) zs_size_classes = nr; } -static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) -{ - return pages_per_zspage * PAGE_SIZE / size; -} - static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) { if (prev->pages_per_zspage != pages_per_zspage) @@ -969,166 +1223,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } -#ifdef CONFIG_ZSMALLOC_STAT - -static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ - class->stats.objs[type] += cnt; -} - -static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ - class->stats.objs[type] -= cnt; -} - -static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) -{ - return class->stats.objs[type]; -} - -static int __init zs_stat_init(void) -{ - if (!debugfs_initialized()) - return -ENODEV; - - zs_stat_root = debugfs_create_dir("zsmalloc", NULL); - if (!zs_stat_root) - return -ENOMEM; - - return 0; -} - -static void __exit zs_stat_exit(void) -{ - debugfs_remove_recursive(zs_stat_root); -} - 
-static int zs_stats_size_show(struct seq_file *s, void *v) +static bool zspage_full(struct page *page) { - int i; - struct zs_pool *pool = s->private; - struct size_class *class; - int objs_per_zspage; - unsigned long obj_allocated, obj_used, pages_used; - unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; - - seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", - "obj_allocated", "obj_used", "pages_used"); - - for (i = 0; i < zs_size_classes; i++) { - class = pool->size_class[i]; - - if (class->index != i) - continue; - - spin_lock(&class->lock); - obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); - obj_used = zs_stat_get(class, OBJ_USED); - spin_unlock(&class->lock); - - objs_per_zspage = get_maxobj_per_zspage(class->size, - class->pages_per_zspage); - pages_used = obj_allocated / objs_per_zspage * - class->pages_per_zspage; - - seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, - class->size, obj_allocated, obj_used, pages_used); - - total_objs += obj_allocated; - total_used_objs += obj_used; - total_pages += pages_used; - } - - seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", - total_objs, total_used_objs, total_pages); - - return 0; -} - -static int zs_stats_size_open(struct inode *inode, struct file *file) -{ - return single_open(file, zs_stats_size_show, inode->i_private); -} - -static const struct file_operations zs_stat_size_ops = { - .open = zs_stats_size_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int zs_pool_stat_create(char *name, struct zs_pool *pool) -{ - struct dentry *entry; - - if (!zs_stat_root) - return -ENODEV; - - entry = debugfs_create_dir(name, zs_stat_root); - if (!entry) { - pr_warn("debugfs dir <%s> creation failed\n", name); - return -ENOMEM; - } - pool->stat_dentry = entry; - - entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, - pool->stat_dentry, pool, &zs_stat_size_ops); - if (!entry) { - pr_warn("%s: debugfs file 
entry <%s> creation failed\n", - name, "obj_in_classes"); - return -ENOMEM; - } - - return 0; -} - -static void zs_pool_stat_destroy(struct zs_pool *pool) -{ - debugfs_remove_recursive(pool->stat_dentry); -} - -#else /* CONFIG_ZSMALLOC_STAT */ - -static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ -} - -static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ -} - -static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) -{ - return 0; -} - -static int __init zs_stat_init(void) -{ - return 0; -} - -static void __exit zs_stat_exit(void) -{ -} - -static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) -{ - return 0; -} + BUG_ON(!is_first_page(page)); -static inline void zs_pool_stat_destroy(struct zs_pool *pool) -{ + return page->inuse == page->objects; } -#endif - unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); @@ -1153,13 +1254,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; + void *ret; BUG_ON(!handle); @@ -1170,7 +1272,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - obj_handle_to_location(handle, &page, &obj_idx); + /* From now on, migration cannot move the object */ + pin_tag(handle); + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); @@ -1180,7 +1286,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, if (off + class->size <= PAGE_SIZE) { /* this object is contained 
entirely within a page */ area->vm_addr = kmap_atomic(page); - return area->vm_addr + off; + ret = area->vm_addr + off; + goto out; } /* this object spans two pages */ @@ -1188,14 +1295,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, pages[1] = get_next_page(page); BUG_ON(!pages[1]); - return __zs_map_object(area, pages, off, class->size); + ret = __zs_map_object(area, pages, off, class->size); +out: + if (!class->huge) + ret += ZS_HANDLE_SIZE; + + return ret; } EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; @@ -1204,7 +1316,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) BUG_ON(!handle); - obj_handle_to_location(handle, &page, &obj_idx); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); @@ -1222,9 +1335,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); +static unsigned long obj_malloc(struct page *first_page, + struct size_class *class, unsigned long handle) +{ + unsigned long obj; + struct link_free *link; + + struct page *m_page; + unsigned long m_objidx, m_offset; + void *vaddr; + + handle |= OBJ_ALLOCATED_TAG; + obj = (unsigned long)first_page->freelist; + obj_to_location(obj, &m_page, &m_objidx); + m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + first_page->freelist = link->next; + if (!class->huge) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else + /* record 
handle in first_page->private */ + set_page_private(first_page, handle); + kunmap_atomic(vaddr); + first_page->inuse++; + zs_stat_inc(class, OBJ_USED, 1); + + return obj; +} + + /** * zs_malloc - Allocate block of given size from pool. * @pool: pool to allocate from @@ -1236,17 +1382,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object); */ unsigned long zs_malloc(struct zs_pool *pool, size_t size) { - unsigned long obj; - struct link_free *link; + unsigned long handle, obj; struct size_class *class; - void *vaddr; - - struct page *first_page, *m_page; - unsigned long m_objidx, m_offset; + struct page *first_page; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; + handle = alloc_handle(pool); + if (!handle) + return 0; + + /* extra space in chunk to keep the handle */ + size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); @@ -1255,8 +1403,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); first_page = alloc_zspage(class, pool->flags); - if (unlikely(!first_page)) + if (unlikely(!first_page)) { + free_handle(pool, handle); return 0; + } set_zspage_mapping(first_page, class->index, ZS_EMPTY); atomic_long_add(class->pages_per_zspage, @@ -1267,73 +1417,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) class->size, class->pages_per_zspage)); } - obj = (unsigned long)first_page->freelist; - obj_handle_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - - vaddr = kmap_atomic(m_page); - link = (struct link_free *)vaddr + m_offset / sizeof(*link); - first_page->freelist = link->next; - memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(vaddr); - - first_page->inuse++; - zs_stat_inc(class, OBJ_USED, 1); + obj = obj_malloc(first_page, class, handle); /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(pool, first_page); + fix_fullness_group(class, first_page); + 
record_obj(handle, obj); spin_unlock(&class->lock); - return obj; + return handle; } EXPORT_SYMBOL_GPL(zs_malloc); -void zs_free(struct zs_pool *pool, unsigned long obj) +static void obj_free(struct zs_pool *pool, struct size_class *class, + unsigned long obj) { struct link_free *link; struct page *first_page, *f_page; unsigned long f_objidx, f_offset; void *vaddr; - int class_idx; - struct size_class *class; enum fullness_group fullness; - if (unlikely(!obj)) - return; + BUG_ON(!obj); - obj_handle_to_location(obj, &f_page, &f_objidx); + obj &= ~OBJ_ALLOCATED_TAG; + obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); get_zspage_mapping(first_page, &class_idx, &fullness); - class = pool->size_class[class_idx]; f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); - spin_lock(&class->lock); + vaddr = kmap_atomic(f_page); /* Insert this object in containing zspage's freelist */ - vaddr = kmap_atomic(f_page); link = (struct link_free *)(vaddr + f_offset); link->next = first_page->freelist; + if (class->huge) + set_page_private(first_page, 0); kunmap_atomic(vaddr); first_page->freelist = (void *)obj; - first_page->inuse--; - fullness = fix_fullness_group(pool, first_page); - zs_stat_dec(class, OBJ_USED, 1); - if (fullness == ZS_EMPTY) +} + +void zs_free(struct zs_pool *pool, unsigned long handle) +{ + struct page *first_page, *f_page; + unsigned long obj, f_objidx; + int class_idx; + struct size_class *class; + enum fullness_group fullness; + + if (unlikely(!handle)) + return; + + pin_tag(handle); + obj = handle_to_obj(handle); + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + obj_free(pool, class, obj); + fullness = fix_fullness_group(class, first_page); + if (fullness == ZS_EMPTY) { zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( class->size, 
class->pages_per_zspage)); - + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + free_zspage(first_page); + } spin_unlock(&class->lock); + unpin_tag(handle); + + free_handle(pool, handle); +} +EXPORT_SYMBOL_GPL(zs_free); + +static void zs_object_copy(unsigned long src, unsigned long dst, + struct size_class *class) +{ + struct page *s_page, *d_page; + unsigned long s_objidx, d_objidx; + unsigned long s_off, d_off; + void *s_addr, *d_addr; + int s_size, d_size, size; + int written = 0; + + s_size = d_size = class->size; + + obj_to_location(src, &s_page, &s_objidx); + obj_to_location(dst, &d_page, &d_objidx); + + s_off = obj_idx_to_offset(s_page, s_objidx, class->size); + d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + + if (s_off + class->size > PAGE_SIZE) + s_size = PAGE_SIZE - s_off; + + if (d_off + class->size > PAGE_SIZE) + d_size = PAGE_SIZE - d_off; + + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + + while (1) { + size = min(s_size, d_size); + memcpy(d_addr + d_off, s_addr + s_off, size); + written += size; + + if (written == class->size) + break; + + s_off += size; + s_size -= size; + d_off += size; + d_size -= size; + + if (s_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); + s_page = get_next_page(s_page); + BUG_ON(!s_page); + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + s_size = class->size - written; + s_off = 0; + } + + if (d_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + d_page = get_next_page(d_page); + BUG_ON(!d_page); + d_addr = kmap_atomic(d_page); + d_size = class->size - written; + d_off = 0; + } + } + + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); +} + +/* + * Find alloced object in zspage from index object and + * return handle. 
+ */ +static unsigned long find_alloced_obj(struct page *page, int index, + struct size_class *class) +{ + unsigned long head; + int offset = 0; + unsigned long handle = 0; + void *addr = kmap_atomic(page); + + if (!is_first_page(page)) + offset = page->index; + offset += class->size * index; + + while (offset < PAGE_SIZE) { + head = obj_to_head(class, page, addr + offset); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (trypin_tag(handle)) + break; + handle = 0; + } + + offset += class->size; + index++; + } + + kunmap_atomic(addr); + return handle; +} + +struct zs_compact_control { + /* Source page for migration which could be a subpage of zspage. */ + struct page *s_page; + /* Destination page for migration which should be a first page + * of zspage. */ + struct page *d_page; + /* Starting object index within @s_page which used for live object + * in the subpage. */ + int index; + /* how many of objects are migrated */ + int nr_migrated; +}; + +static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + struct zs_compact_control *cc) +{ + unsigned long used_obj, free_obj; + unsigned long handle; + struct page *s_page = cc->s_page; + struct page *d_page = cc->d_page; + unsigned long index = cc->index; + int nr_migrated = 0; + int ret = 0; + + while (1) { + handle = find_alloced_obj(s_page, index, class); + if (!handle) { + s_page = get_next_page(s_page); + if (!s_page) + break; + index = 0; + continue; + } + + /* Stop if there is no more space */ + if (zspage_full(d_page)) { + unpin_tag(handle); + ret = -ENOMEM; + break; + } + + used_obj = handle_to_obj(handle); + free_obj = obj_malloc(d_page, class, handle); + zs_object_copy(used_obj, free_obj, class); + index++; + record_obj(handle, free_obj); + unpin_tag(handle); + obj_free(pool, class, used_obj); + nr_migrated++; + } + + /* Remember last position in this iteration */ + cc->s_page = s_page; + cc->index = index; + cc->nr_migrated = nr_migrated; + + return ret; +} + 
+static struct page *alloc_target_page(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) { + remove_zspage(page, class, i); + break; + } + } + + return page; +} + +static void putback_zspage(struct zs_pool *pool, struct size_class *class, + struct page *first_page) +{ + enum fullness_group fullness; + + BUG_ON(!is_first_page(first_page)); + + fullness = get_fullness_group(first_page); + insert_zspage(first_page, class, fullness); + set_zspage_mapping(first_page, class->index, fullness); if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); + free_zspage(first_page); } } -EXPORT_SYMBOL_GPL(zs_free); + +static struct page *isolate_source_page(struct size_class *class) +{ + struct page *page; + + page = class->fullness_list[ZS_ALMOST_EMPTY]; + if (page) + remove_zspage(page, class, ZS_ALMOST_EMPTY); + + return page; +} + +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) +{ + int nr_to_migrate; + struct zs_compact_control cc; + struct page *src_page; + struct page *dst_page = NULL; + unsigned long nr_total_migrated = 0; + + spin_lock(&class->lock); + while ((src_page = isolate_source_page(class))) { + + BUG_ON(!is_first_page(src_page)); + + /* The goal is to migrate all live objects in source page */ + nr_to_migrate = src_page->inuse; + cc.index = 0; + cc.s_page = src_page; + + while ((dst_page = alloc_target_page(class))) { + cc.d_page = dst_page; + /* + * If there is no more space in dst_page, try to + * allocate another zspage. 
+ */ + if (!migrate_zspage(pool, class, &cc)) + break; + + putback_zspage(pool, class, dst_page); + nr_total_migrated += cc.nr_migrated; + nr_to_migrate -= cc.nr_migrated; + } + + /* Stop if we couldn't find slot */ + if (dst_page == NULL) + break; + + putback_zspage(pool, class, dst_page); + putback_zspage(pool, class, src_page); + spin_unlock(&class->lock); + nr_total_migrated += cc.nr_migrated; + cond_resched(); + spin_lock(&class->lock); + } + + if (src_page) + putback_zspage(pool, class, src_page); + + spin_unlock(&class->lock); + + return nr_total_migrated; +} + +unsigned long zs_compact(struct zs_pool *pool) +{ + int i; + unsigned long nr_migrated = 0; + struct size_class *class; + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + nr_migrated += __zs_compact(pool, class); + } + + return nr_migrated; +} +EXPORT_SYMBOL_GPL(zs_compact); /** * zs_create_pool - Creates an allocation pool to work from. @@ -1355,20 +1792,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) if (!pool) return NULL; - pool->name = kstrdup(name, GFP_KERNEL); - if (!pool->name) { - kfree(pool); - return NULL; - } - pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), GFP_KERNEL); if (!pool->size_class) { - kfree(pool->name); kfree(pool); return NULL; } + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) + goto err; + + if (create_handle_cache(pool)) + goto err; + /* * Iterate reversly, because, size of size_class that we want to use * for merging should be larger or equal to current size. 
@@ -1406,6 +1843,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; + if (pages_per_zspage == 1 && + get_maxobj_per_zspage(size, pages_per_zspage) == 1) + class->huge = true; spin_lock_init(&class->lock); pool->size_class[i] = class; @@ -1450,6 +1890,7 @@ void zs_destroy_pool(struct zs_pool *pool) kfree(class); } + destroy_handle_cache(pool); kfree(pool->size_class); kfree(pool->name); kfree(pool); diff --git a/mm/zswap.c b/mm/zswap.c index 4249e82ff934..2d5727baed59 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -75,9 +75,10 @@ static u64 zswap_duplicate_entry; /********************************* * tunables **********************************/ -/* Enable/disable zswap (disabled by default, fixed at boot for now) */ -static bool zswap_enabled __read_mostly; -module_param_named(enabled, zswap_enabled, bool, 0444); + +/* Enable/disable zswap (disabled by default) */ +static bool zswap_enabled; +module_param_named(enabled, zswap_enabled, bool, 0644); /* Compressor to be used by zswap (fixed at boot for now) */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" @@ -648,7 +649,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, u8 *src, *dst; struct zswap_header *zhdr; - if (!tree) { + if (!zswap_enabled || !tree) { ret = -ENODEV; goto reject; } @@ -901,9 +902,6 @@ static int __init init_zswap(void) { gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; - if (!zswap_enabled) - return 0; - pr_info("loading zswap\n"); zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |