diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 188 |
1 files changed, 164 insertions, 24 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3c15bb07cce..2156ef775d04 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -57,6 +57,7 @@ #include <linux/lockdep.h> #include <linux/file.h> #include <linux/tracehook.h> +#include <linux/psi.h> #include <linux/seq_buf.h> #include "internal.h" #include <net/sock.h> @@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; +#endif static int memcg_shrinker_map_size; static DEFINE_MUTEX(memcg_shrinker_map_mutex); @@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) } } -#else /* CONFIG_MEMCG_KMEM */ -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - return 0; -} -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } -#endif /* CONFIG_MEMCG_KMEM */ - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -2270,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; + bool flush = false; + rcu_read_lock(); memcg = stock->cached; - if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) - continue; - if (!mem_cgroup_is_descendant(memcg, root_memcg)) { - css_put(&memcg->css); - continue; - } - if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (memcg && stock->nr_pages && + mem_cgroup_is_descendant(memcg, root_memcg)) + flush = true; + rcu_read_unlock(); + + if (flush && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) drain_local_stock(&stock->work); else schedule_work_on(cpu, &stock->work); } - css_put(&memcg->css); } put_cpu(); mutex_unlock(&percpu_charge_mutex); @@ -2359,11 +2354,67 @@ static void high_work_func(struct work_struct *work) } /* + * Clamp the maximum sleep time per allocation batch to 2 seconds. This is + * enough to still cause a significant slowdown in most cases, while still + * allowing diagnostics and tracing to proceed without becoming stuck. + */ +#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) + +/* + * When calculating the delay, we use these either side of the exponentiation to + * maintain precision and scale to a reasonable number of jiffies (see the table + * below. + * + * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the + * overage ratio to a delay. + * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the + * proposed penalty in order to reduce to a reasonable number of jiffies, and + * to produce a reasonable delay curve. + * + * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a + * reasonable delay curve compared to precision-adjusted overage, not + * penalising heavily at first, but still making sure that growth beyond the + * limit penalises misbehaviour cgroups by slowing them down exponentially. For + * example, with a high of 100 megabytes: + * + * +-------+------------------------+ + * | usage | time to allocate in ms | + * +-------+------------------------+ + * | 100M | 0 | + * | 101M | 6 | + * | 102M | 25 | + * | 103M | 57 | + * | 104M | 102 | + * | 105M | 159 | + * | 106M | 230 | + * | 107M | 313 | + * | 108M | 409 | + * | 109M | 518 | + * | 110M | 639 | + * | 111M | 774 | + * | 112M | 921 | + * | 113M | 1081 | + * | 114M | 1254 | + * | 115M | 1439 | + * | 116M | 1638 | + * | 117M | 1849 | + * | 118M | 2000 | + * | 119M | 2000 | + * | 120M | 2000 | + * +-------+------------------------+ + */ + #define MEMCG_DELAY_PRECISION_SHIFT 20 + #define MEMCG_DELAY_SCALING_SHIFT 14 + +/* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. */ void mem_cgroup_handle_over_high(void) { + unsigned long usage, high, clamped_high; + unsigned long pflags; + unsigned long penalty_jiffies, overage; unsigned int nr_pages = current->memcg_nr_pages_over_high; struct mem_cgroup *memcg; @@ -2372,8 +2423,75 @@ void mem_cgroup_handle_over_high(void) memcg = get_mem_cgroup_from_mm(current->mm); reclaim_high(memcg, nr_pages, GFP_KERNEL); - css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; + + /* + * memory.high is breached and reclaim is unable to keep up. Throttle + * allocators proactively to slow down excessive growth. + * + * We use overage compared to memory.high to calculate the number of + * jiffies to sleep (penalty_jiffies). Ideally this value should be + * fairly lenient on small overages, and increasingly harsh when the + * memcg in question makes it clear that it has no intention of stopping + * its crazy behaviour, so we exponentially increase the delay based on + * overage amount. + */ + + usage = page_counter_read(&memcg->memory); + high = READ_ONCE(memcg->high); + + if (usage <= high) + goto out; + + /* + * Prevent division by 0 in overage calculation by acting as if it was a + * threshold of 1 page + */ + clamped_high = max(high, 1UL); + + overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, + clamped_high); + + penalty_jiffies = ((u64)overage * overage * HZ) + >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); + + /* + * Factor in the task's own contribution to the overage, such that four + * N-sized allocations are throttled approximately the same as one + * 4N-sized allocation. + * + * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or + * larger the current charge patch is than that. + */ + penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; + + /* + * Clamp the max delay per usermode return so as to still keep the + * application moving forwards and also permit diagnostics, albeit + * extremely slowly. + */ + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + + /* + * Don't sleep if the amount of jiffies this memcg owes us is so low + * that it's not even worth doing, in an attempt to be nice to those who + * go only a small amount over their memory.high value and maybe haven't + * been aggressively reclaimed enough yet. + */ + if (penalty_jiffies <= HZ / 100) + goto out; + + /* + * If we exit early, we're guaranteed to die (since + * schedule_timeout_killable sets TASK_KILLABLE). This means we don't + * need to account for any ill-begotten jiffies to pay them off later. + */ + psi_memstall_enter(&pflags); + schedule_timeout_killable(penalty_jiffies); + psi_memstall_leave(&pflags); + +out: + css_put(&memcg->css); } static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -3512,6 +3630,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: + pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: @@ -4805,11 +4926,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) } } -static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) -{ - mem_cgroup_id_get_many(memcg, 1); -} - static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) { mem_cgroup_id_put_many(memcg, 1); @@ -4955,6 +5071,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->cgwb_frn[i].done = __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; +#endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: @@ -5333,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page, __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && !list_empty(page_deferred_list(page))) { + spin_lock(&from->deferred_split_queue.split_queue_lock); + list_del_init(page_deferred_list(page)); + from->deferred_split_queue.split_queue_len--; + spin_unlock(&from->deferred_split_queue.split_queue_lock); + } +#endif /* * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with @@ -5341,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ page->mem_cgroup = to; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && list_empty(page_deferred_list(page))) { + spin_lock(&to->deferred_split_queue.split_queue_lock); + list_add_tail(page_deferred_list(page), + &to->deferred_split_queue.split_queue); + to->deferred_split_queue.split_queue_len++; + spin_unlock(&to->deferred_split_queue.split_queue_lock); + } +#endif + spin_unlock_irqrestore(&from->move_lock, flags); ret = 0; @@ -6511,7 +6651,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) unsigned int nr_pages = 1; if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); + nr_pages = compound_nr(page); ug->nr_huge += nr_pages; } if (PageAnon(page)) @@ -6523,7 +6663,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->pgpgout++; } else { - ug->nr_kmem += 1 << compound_order(page); + ug->nr_kmem += compound_nr(page); __ClearPageKmemcg(page); } |