From d42f3245c7e299e017213fa028c319316bcdb7f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Aug 2020 23:20:39 -0700 Subject: mm: memcg: convert vmstat slab counters to bytes In order to prepare for per-object slab memory accounting, convert NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE vmstat items to bytes. To make it obvious, rename them to NR_SLAB_RECLAIMABLE_B and NR_SLAB_UNRECLAIMABLE_B (similar to NR_KERNEL_STACK_KB). Internally global and per-node counters are stored in pages, however memcg and lruvec counters are stored in bytes. This scheme may look weird, but only for now. As soon as slab pages will be shared between multiple cgroups, global and node counters will reflect the total number of slab pages. However memcg and lruvec counters will be used for per-memcg slab memory tracking, which will take separate kernel objects in the account. Keeping global and node counters in pages helps to avoid additional overhead. The size of slab memory shouldn't exceed 4Gb on 32-bit machines, so it will fit into atomic_long_t we use for vmstats. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200623174037.3951353-4-guro@fb.com Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc/meminfo.c') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index e9a6841fc25b..38ea95fd919a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -52,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); - sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); - sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); + sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); -- cgit v1.2.3 From 991e7673859ed41e7ba83c8c4e57afe8cfebe314 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 6 Aug 2020 23:21:37 -0700 Subject: mm: memcontrol: account kernel stack per node Currently the kernel stack is being accounted per-zone. There is no need to do that. In addition due to being per-zone, memcg has to keep a separate MEMCG_KERNEL_STACK_KB. Make the stat per-node and deprecate MEMCG_KERNEL_STACK_KB as memcg_stat_item is an extension of node_stat_item. In addition localize the kernel stack stats updates to account_kernel_stack(). Signed-off-by: Shakeel Butt Signed-off-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200630161539.1759185-1-shakeelb@google.com Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ++-- fs/proc/meminfo.c | 4 ++-- include/linux/memcontrol.h | 21 +++++++++++++++++-- include/linux/mmzone.h | 8 ++++---- kernel/fork.c | 51 +++++++++++++--------------------------------- kernel/scs.c | 2 +- mm/memcontrol.c | 2 +- mm/page_alloc.c | 16 +++++++-------- mm/vmstat.c | 8 ++++---- 9 files changed, 55 insertions(+), 61 deletions(-) (limited to 'fs/proc/meminfo.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 0cf13e31603c..508b80f6329b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -440,9 +440,9 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_FILE_MAPPED)), nid, K(node_page_state(pgdat, NR_ANON_MAPPED)), nid, K(i.sharedram), - nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), + nid, node_page_state(pgdat, NR_KERNEL_STACK_KB), #ifdef CONFIG_SHADOW_CALL_STACK - nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB), + nid, node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), nid, 0UL, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 38ea95fd919a..2a4c58f70fb9 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -101,10 +101,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SReclaimable: ", sreclaimable); show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", - global_zone_page_state(NR_KERNEL_STACK_KB)); + global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", - global_zone_page_state(NR_KERNEL_SCS_KB)); + global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5a8b62d075e6..624400c27eba 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,8 +32,6 @@ struct kmem_cache; enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, - /* XXX: why are these zone and not node counters? */ - MEMCG_KERNEL_STACK_KB, MEMCG_NR_STAT, }; @@ -729,8 +727,19 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); + void mod_memcg_obj_state(void *p, int idx, int val); +static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, + int val) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_lruvec_slab_state(p, idx, val); + local_irq_restore(flags); +} + static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { @@ -1151,6 +1160,14 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, __mod_node_page_state(page_pgdat(page), idx, val); } +static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, + int val) +{ + struct page *page = virt_to_head_page(p); + + mod_node_page_state(page_pgdat(page), idx, val); +} + static inline void mod_memcg_obj_state(void *p, int idx, int val) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b79100edd228..a3bd54139a30 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -155,10 +155,6 @@ enum zone_stat_item { NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_PAGETABLE, /* used for pagetables */ - NR_KERNEL_STACK_KB, /* measured in KiB */ -#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) - NR_KERNEL_SCS_KB, /* measured in KiB */ -#endif /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) @@ -203,6 +199,10 @@ enum node_stat_item { NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ + NR_KERNEL_STACK_KB, /* measured in KiB */ +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + NR_KERNEL_SCS_KB, /* measured in KiB */ +#endif NR_VM_NODE_STAT_ITEMS }; diff --git a/kernel/fork.c b/kernel/fork.c index 76d3f3387554..c7b4ce9d2647 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk) if (vm) { int i; - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - -(int)(PAGE_SIZE / 1024)); - + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); - } for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], @@ -382,31 +377,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account) void *stack = task_stack_page(tsk); struct vm_struct *vm = task_stack_vm_area(tsk); - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); - - if (vm) { - int i; - - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_zone_page_state(page_zone(vm->pages[i]), - NR_KERNEL_STACK_KB, - PAGE_SIZE / 1024 * account); - } - } else { - /* - * All stack pages are in the same zone and belong to the - * same memcg. - */ - struct page *first_page = virt_to_page(stack); - - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); - - mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); - } + /* All stack pages are in the same node. */ + if (vm) + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + else + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } static int memcg_charge_kernel_stack(struct task_struct *tsk) @@ -415,24 +393,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) struct vm_struct *vm = task_stack_vm_area(tsk); int ret; + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + if (vm) { int i; + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and both memcg_kmem_uncharge_page() - * and mod_memcg_page_state() in free_thread_stack() - * will ignore this page. So it's safe. + * pointer is NULL, and memcg_kmem_uncharge_page() in + * free_thread_stack() will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) return ret; - - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - PAGE_SIZE / 1024); } } #endif diff --git a/kernel/scs.c b/kernel/scs.c index 5d4d9bbdec36..4ff4a7ba0094 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -17,7 +17,7 @@ static void __scs_account(void *s, int account) { struct page *scs_page = virt_to_page(s); - mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 473f9b91d51f..a3e963366769 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1485,7 +1485,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * + (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) * 1024); seq_buf_printf(&s, "slab %llu\n", (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9ad093814d2..8d5d8526c2f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5396,6 +5396,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" + " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5417,6 +5421,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + node_page_state(pgdat, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + node_page_state(pgdat, NR_KERNEL_SCS_KB), +#endif pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5448,10 +5456,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " kernel_stack:%lukB" -#ifdef CONFIG_SHADOW_CALL_STACK - " shadow_call_stack:%lukB" -#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -5473,10 +5477,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - zone_page_state(zone, NR_KERNEL_STACK_KB), -#ifdef CONFIG_SHADOW_CALL_STACK - zone_page_state(zone, NR_KERNEL_SCS_KB), -#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), diff --git a/mm/vmstat.c b/mm/vmstat.c index b171a76bfe83..2b866cbab11d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1140,10 +1140,6 @@ const char * const vmstat_text[] = { "nr_zone_write_pending", "nr_mlock", "nr_page_table_pages", - "nr_kernel_stack", -#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) - "nr_shadow_call_stack", -#endif "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1194,6 +1190,10 @@ const char * const vmstat_text[] = { "nr_kernel_misc_reclaimable", "nr_foll_pin_acquired", "nr_foll_pin_released", + "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", -- cgit v1.2.3 From 1455083c1d70f69028df0eb92b69fd24db277e4b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 6 Aug 2020 23:23:03 -0700 Subject: proc/meminfo: avoid open coded reading of vm_committed_as Patch series "make vm_committed_as_batch aware of vm overcommit policy", v6. When checking a performance change for will-it-scale scalability mmap test [1], we found very high lock contention for spinlock of percpu counter 'vm_committed_as': 94.14% 0.35% [kernel.kallsyms] [k] _raw_spin_lock_irqsave 48.21% _raw_spin_lock_irqsave;percpu_counter_add_batch;__vm_enough_memory;mmap_region;do_mmap; 45.91% _raw_spin_lock_irqsave;percpu_counter_add_batch;__do_munmap; Actually this heavy lock contention is not always necessary. The 'vm_committed_as' needs to be very precise when the strict OVERCOMMIT_NEVER policy is set, which requires a rather small batch number for the percpu counter. So keep 'batch' number unchanged for strict OVERCOMMIT_NEVER policy, and enlarge it for not-so-strict OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS policies. Benchmark with the same testcase in [1] shows 53% improvement on a 8C/16T desktop, and 2097%(20X) on a 4S/72C/144T server. And for that case, whether it shows improvements depends on if the test mmap size is bigger than the batch number computed. We tested 10+ platforms in 0day (server, desktop and laptop). If we lift it to 64X, 80%+ platforms show improvements, and for 16X lift, 1/3 of the platforms will show improvements. And generally it should help the mmap/unmap usage,as Michal Hocko mentioned: : I believe that there are non-synthetic worklaods which would benefit : from a larger batch. E.g. large in memory databases which do large : mmaps during startups from multiple threads. Note: There are some style complain from checkpatch for patch 4, as sysctl handler declaration follows the similar format of sibling functions [1] https://lore.kernel.org/lkml/20200305062138.GI5972@shao2-debian/ This patch (of 4): Use the existing vm_memory_committed() instead, which is also convenient for future change. Signed-off-by: Feng Tang Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Mel Gorman Cc: Qian Cai Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Dave Hansen Cc: Huang Ying Cc: Christoph Lameter Cc: Dennis Zhou Cc: Haiyang Zhang Cc: kernel test robot Cc: "K. Y. Srinivasan" Cc: Tejun Heo Link: http://lkml.kernel.org/r/1594389708-60781-1-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1594389708-60781-2-git-send-email-feng.tang@intel.com Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc/meminfo.c') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 2a4c58f70fb9..887a5532e449 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -41,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); - committed = percpu_counter_read_positive(&vm_committed_as); + committed = vm_memory_committed(); cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; -- cgit v1.2.3