diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 527 |
1 files changed, 326 insertions, 201 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b34ef4a32a3b..acb93c554f6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -14,6 +14,12 @@ * Copyright (C) 2012 Parallels Inc. and Google Inc. * Authors: Glauber Costa and Suleiman Souhlal * + * Native page reclaim + * Charge lifetime sanitation + * Lockless page tracking & accounting + * Unified hierarchy configuration model + * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -71,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; +struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -84,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = { "rss", "rss_huge", "mapped_file", + "dirty", "writeback", "swap", }; @@ -253,11 +261,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. - * - * TODO: Add a water mark for the memory controller. Reclaim will begin when - * we hit the water mark. May be even add a low water mark, such that - * no reclaim occurs from a cgroup at it's low water mark, this is - * a feature that will be implemented much later in the future. */ struct mem_cgroup { struct cgroup_subsys_state css; @@ -284,9 +287,9 @@ struct mem_cgroup { */ bool use_hierarchy; + /* protected by memcg_oom_lock */ bool oom_lock; - atomic_t under_oom; - atomic_t oom_wakeups; + int under_oom; int swappiness; /* OOM-Killer disable */ @@ -321,11 +324,6 @@ struct mem_cgroup { * percpu counter. */ struct mem_cgroup_stat_cpu __percpu *stat; - /* - * used when a cpu is offlined or other synchronizations - * See mem_cgroup_read_stat(). - */ - struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) @@ -345,6 +343,11 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; + struct wb_domain cgwb_domain; +#endif + /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -454,6 +457,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) return memcg->css.id; } +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) + */ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; @@ -589,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) return &memcg->css; } +/** + * mem_cgroup_css_from_page - css of the memcg associated with a page + * @page: page of interest + * + * If memcg is bound to the default hierarchy, css of the memcg associated + * with @page is returned. The returned css remains associated with @page + * until it is released. + * + * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup + * is returned. + * + * XXX: The above description of behavior on the default hierarchy isn't + * strictly true yet as replace_page_cache_page() can modify the + * association before @page is released even on the default hierarchy; + * however, the current and planned usages don't mix the the two functions + * and replace_page_cache_page() will soon be updated to make the invariant + * actually true. + */ +struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + + memcg = page->mem_cgroup; + + if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) + memcg = root_mem_cgroup; + + rcu_read_unlock(); + return &memcg->css; +} + static struct mem_cgroup_per_zone * mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { @@ -667,7 +709,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static unsigned long soft_limit_excess(struct mem_cgroup *memcg) { unsigned long nr_pages = page_counter_read(&memcg->memory); - unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); + unsigned long soft_limit = READ_ONCE(memcg->soft_limit); unsigned long excess = 0; if (nr_pages > soft_limit) @@ -788,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->count[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.count[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); return val; } @@ -806,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.events[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); return val; } @@ -1035,7 +1063,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, goto out_unlock; do { - pos = ACCESS_ONCE(iter->position); + pos = READ_ONCE(iter->position); /* * A racing update may change the position and * put the last reference, hence css_tryget(), @@ -1352,13 +1380,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = ACCESS_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.limit); if (count < limit) margin = limit - count; if (do_swap_account) { count = page_counter_read(&memcg->memsw); - limit = ACCESS_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) margin = min(margin, limit - count); } @@ -1436,15 +1464,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) struct mem_cgroup *iter; unsigned int i; - if (!p) - return; - mutex_lock(&oom_info_lock); rcu_read_lock(); - pr_info("Task in "); - pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); + if (p) { + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_cont(" killed as a result of limit of "); + } else { + pr_info("Memory limit reached of cgroup "); + } + pr_cont_cgroup_path(memcg->css.cgroup); pr_cont("\n"); @@ -1521,17 +1551,19 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int points = 0; struct task_struct *chosen = NULL; + mutex_lock(&oom_lock); + /* * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - mark_tsk_oom_victim(current); - return; + mark_oom_victim(current); + goto unlock; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1555,7 +1587,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); - return; + goto unlock; case OOM_SCAN_OK: break; }; @@ -1576,11 +1608,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_end(&it); } - if (!chosen) - return; - points = chosen_points * 1000 / totalpages; - oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, - NULL, "Memory cgroup out of memory"); + if (chosen) { + points = chosen_points * 1000 / totalpages; + oom_kill_process(chosen, gfp_mask, order, points, totalpages, + memcg, NULL, "Memory cgroup out of memory"); + } +unlock: + mutex_unlock(&oom_lock); } #if MAX_NUMNODES > 1 @@ -1797,8 +1831,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_inc(&iter->under_oom); + iter->under_oom++; + spin_unlock(&memcg_oom_lock); } static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) @@ -1807,11 +1843,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) /* * When a new child is created while the hierarchy is under oom, - * mem_cgroup_oom_lock() may not be called. We have to use - * atomic_add_unless() here. + * mem_cgroup_oom_lock() may not be called. Watch for underflow. */ + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_add_unless(&iter->under_oom, -1, 0); + if (iter->under_oom > 0) + iter->under_oom--; + spin_unlock(&memcg_oom_lock); } static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1837,17 +1875,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait, return autoremove_wake_function(wait, mode, sync, arg); } -static void memcg_wakeup_oom(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->oom_wakeups); - /* for filtering, pass "memcg" as argument. */ - __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); -} - static void memcg_oom_recover(struct mem_cgroup *memcg) { - if (memcg && atomic_read(&memcg->under_oom)) - memcg_wakeup_oom(memcg); + /* + * For the following lockless ->under_oom test, the only required + * guarantee is that it must see the state asserted by an OOM when + * this function is called as a result of userland actions + * triggered by the notification of the OOM. This is trivially + * achieved by invoking mem_cgroup_mark_under_oom() before + * triggering notification. + */ + if (memcg && memcg->under_oom) + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) @@ -2002,6 +2041,7 @@ again: return memcg; } +EXPORT_SYMBOL(mem_cgroup_begin_page_stat); /** * mem_cgroup_end_page_stat - finish a page state statistics transaction @@ -2020,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } +EXPORT_SYMBOL(mem_cgroup_end_page_stat); /** * mem_cgroup_update_page_stat - update page state statistics @@ -2160,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -/* - * This function drains percpu counter value from DEAD cpu and - * move it to local cpu. Note that this function can be preempted. - */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) -{ - int i; - - spin_lock(&memcg->pcp_counter_lock); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - long x = per_cpu(memcg->stat->count[i], cpu); - - per_cpu(memcg->stat->count[i], cpu) = 0; - memcg->nocpu_base.count[i] += x; - } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { - unsigned long x = per_cpu(memcg->stat->events[i], cpu); - - per_cpu(memcg->stat->events[i], cpu) = 0; - memcg->nocpu_base.events[i] += x; - } - spin_unlock(&memcg->pcp_counter_lock); -} - static int memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; - struct mem_cgroup *iter; if (action == CPU_ONLINE) return NOTIFY_OK; @@ -2198,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup(iter) - mem_cgroup_drain_pcp_counter(iter, cpu); - stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); return NOTIFY_OK; @@ -2314,6 +2327,8 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + if (!(gfp_mask & __GFP_WAIT)) + goto done; /* * If the hierarchy is above the normal consumption range, * make the charging task trim their excess contribution. @@ -2341,20 +2356,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) } /* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) -{ - /* ID 0 is unused ID */ - if (!id) - return NULL; - return mem_cgroup_from_id(id); -} - -/* * try_get_mem_cgroup_from_page - look up page's memcg association * @page: the page * @@ -2380,7 +2381,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); + memcg = mem_cgroup_from_id(id); if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); @@ -2642,7 +2643,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) return cachep; memcg = get_mem_cgroup_from_mm(current->mm); - kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); + kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2779,92 +2780,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/** - * mem_cgroup_move_account - move account of the page - * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. - * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 - * - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" - * from old cgroup. - */ -static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, - struct mem_cgroup *from, - struct mem_cgroup *to) -{ - unsigned long flags; - int ret; - - VM_BUG_ON(from == to); - VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; - - /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. - */ - if (!trylock_page(page)) - goto out; - - ret = -EINVAL; - if (page->mem_cgroup != from) - goto out_unlock; - - spin_lock_irqsave(&from->move_lock, flags); - - if (!PageAnon(page) && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - } - - if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - } - - /* - * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. - */ - - /* caller should have done css_get */ - page->mem_cgroup = to; - spin_unlock_irqrestore(&from->move_lock, flags); - - ret = 0; - - local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); - local_irq_enable(); -out_unlock: - unlock_page(page); -out: - return ret; -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -3953,7 +3868,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, list_add(&event->list, &memcg->oom_notify); /* already in OOM ? */ - if (atomic_read(&memcg->under_oom)) + if (memcg->under_oom) eventfd_signal(eventfd, 1); spin_unlock(&memcg_oom_lock); @@ -3982,7 +3897,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); - seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); + seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); return 0; } @@ -4084,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_CGROUP_WRITEBACK + +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) +{ + return &memcg->cgwb_list; +} + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ + wb_domain_size_changed(&memcg->cgwb_domain); +} + +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +/** + * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg + * @wb: bdi_writeback in question + * @pavail: out parameter for number of available pages + * @pdirty: out parameter for number of dirty pages + * @pwriteback: out parameter for number of pages under writeback + * + * Determine the numbers of available, dirty, and writeback pages in @wb's + * memcg. Dirty and writeback are self-explanatory. Available is a bit + * more involved. + * + * A memcg's headroom is "min(max, high) - used". The available memory is + * calculated as the lowest headroom of itself and the ancestors plus the + * number of pages already being used for file pages. Note that this + * doesn't consider the actual amount of available memory in the system. + * The caller should further cap *@pavail accordingly. + */ +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, + unsigned long *pdirty, unsigned long *pwriteback) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + struct mem_cgroup *parent; + unsigned long head_room = PAGE_COUNTER_MAX; + unsigned long file_pages; + + *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); + + /* this should eventually include NR_UNSTABLE_NFS */ + *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + + file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | + (1 << LRU_ACTIVE_FILE)); + while ((parent = parent_mem_cgroup(memcg))) { + unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long used = page_counter_read(&memcg->memory); + + head_room = min(head_room, ceiling - min(ceiling, used)); + memcg = parent; + } + + *pavail = file_pages + head_room; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * DO NOT USE IN NEW FILES. * @@ -4468,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto out_free; + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto out_free_stat; + spin_lock_init(&memcg->pcp_counter_lock); return memcg; +out_free_stat: + free_percpu(memcg->stat); out_free: kfree(memcg); return NULL; @@ -4497,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); + memcg_wb_domain_exit(memcg); kfree(memcg); } @@ -4529,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; + mem_cgroup_root_css = &memcg->css; page_counter_init(&memcg->memory, NULL); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -4547,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); +#endif return &memcg->css; free_out: @@ -4635,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); memcg_deactivate_kmem(memcg); + + wb_memcg_offline(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) @@ -4668,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; + memcg_wb_domain_size_changed(memcg); } #ifdef CONFIG_MMU @@ -4816,6 +4836,111 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return page; } +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. @from != @to. + * + * The caller must confirm following. + * - page is not on LRU (isolate_page() is useful.) + * - compound_lock is held when nr_pages > 1 + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + unsigned long flags; + int ret; + bool anon; + + VM_BUG_ON(from == to); + VM_BUG_ON_PAGE(PageLRU(page), page); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + /* + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out; + + ret = -EINVAL; + if (page->mem_cgroup != from) + goto out_unlock; + + anon = PageAnon(page); + + spin_lock_irqsave(&from->move_lock, flags); + + if (!anon && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } + + /* + * move_lock grabbed above and caller set from->moving_account, so + * mem_cgroup_update_page_stat() will serialize updates to PageDirty. + * So mapping should be stable for dirty pages. + */ + if (!anon && PageDirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + } + } + + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } + + /* + * It is safe to change page->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ + + /* caller should have done css_get */ + page->mem_cgroup = to; + spin_unlock_irqrestore(&from->move_lock, flags); + + ret = 0; + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); + memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); + memcg_check_events(from, page); + local_irq_enable(); +out_unlock: + unlock_page(page); +out: + return ret; +} + static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { @@ -5012,7 +5137,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * tunable will only affect upcoming migrations, not the current one. * So we need to save it, and keep it going. */ - move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); if (move_flags) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); @@ -5246,7 +5371,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = ACCESS_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5276,7 +5401,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long high = ACCESS_ONCE(memcg->high); + unsigned long high = READ_ONCE(memcg->high); if (high == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5300,13 +5425,14 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + memcg_wb_domain_size_changed(memcg); return nbytes; } static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = ACCESS_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.limit); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5332,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5838,9 +5965,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); - /* XXX: caller holds IRQ-safe mapping->tree_lock */ - VM_BUG_ON(!irqs_disabled()); - + /* Caller disabled preemption with mapping->tree_lock */ mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); } @@ -5861,7 +5986,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) id = swap_cgroup_record(entry, 0); rcu_read_lock(); - memcg = mem_cgroup_lookup(id); + memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memsw, 1); |