From 3c0c9bc9c9596d5cd69529da822526f88673365b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:47 +0200 Subject: mm: vmalloc: add alloc_vmap_area trace event Patch series "Add basic trace events for vmap/vmalloc (v2)", v2. This small series add some basic trace events for the vmap/vmalloc code. Since currently we lack any, sometimes it is hard to start debuging vmap code if an issue is reported or occured. For example https://lore.kernel.org/linux-mm/Y0p8BZIiDXLQbde%2F@pc636/T/ The final patch adds two reviewers for vmalloc code. This patch (of 7): It is for debug purposes and for validation of passed parameters. Link: https://lkml.kernel.org/r/20221018181053.434508-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20221018181053.434508-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 56 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 include/trace/events/vmalloc.h (limited to 'include/trace') diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h new file mode 100644 index 000000000000..39fbd77c91e7 --- /dev/null +++ b/include/trace/events/vmalloc.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vmalloc + +#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VMALLOC_H + +#include + +/** + * alloc_vmap_area - called when a new vmap allocation occurs + * @addr: an allocated address + * @size: a requested size + * @align: a requested alignment + * @vstart: a requested start range + * @vend: a requested end range + * @failed: an allocation failed or not + * + * This event is used for a debug purpose, it can give an extra + * information for a developer about how often it occurs and which + * parameters are passed for further validation. + */ +TRACE_EVENT(alloc_vmap_area, + + TP_PROTO(unsigned long addr, unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int failed), + + TP_ARGS(addr, size, align, vstart, vend, failed), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, size) + __field(unsigned long, align) + __field(unsigned long, vstart) + __field(unsigned long, vend) + __field(int, failed) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->size = size; + __entry->align = align; + __entry->vstart = vstart; + __entry->vend = vend; + __entry->failed = failed; + ), + + TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d", + __entry->addr, __entry->size, __entry->align, + __entry->vstart, __entry->vend, __entry->failed) +); + +#endif /* _TRACE_VMALLOC_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From b3a5a7b099162e1b11db459f8128d4374f7d1c05 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:48 +0200 Subject: mm: vmalloc: add purge_vmap_area_lazy trace event It is for debug purposes to track number of freed vmap areas including a range it occurs on. 
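The headers in this series only declare the events; the mm/vmalloc.c call sites are wired up by later patches that are not part of this excerpt. As a rough, hedged sketch of how such call sites fire the new events, the functions below are stand-ins invented for illustration (find_free_va_range() is a made-up placeholder for the real free-space search, and the "addr == vend means failure" convention is an assumption, not something these patches state):

	/* In exactly one .c file, so the event bodies are emitted: */
	#define CREATE_TRACE_POINTS
	#include <trace/events/vmalloc.h>

	static unsigned long demo_alloc_vmap_area(unsigned long size,
						  unsigned long align,
						  unsigned long vstart,
						  unsigned long vend)
	{
		unsigned long addr;
		int failed;

		/* find_free_va_range() is a hypothetical helper, not kernel code. */
		addr = find_free_va_range(size, align, vstart, vend);
		failed = (addr == vend);	/* assumed "no space" convention */

		/* Hand every requested parameter plus the outcome to the tracer. */
		trace_alloc_vmap_area(addr, size, align, vstart, vend, failed);

		return failed ? 0 : addr;
	}

	static void demo_purge_window(unsigned long start, unsigned long end,
				      unsigned int npurged)
	{
		/* Fired once per drain of the lazy-free list covering [start:end]. */
		trace_purge_vmap_area_lazy(start, end, npurged);
	}

Once wired up, the events appear under events/vmalloc/ in tracefs with the TP_printk() formats defined in the header.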
Link: https://lkml.kernel.org/r/20221018181053.434508-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h index 39fbd77c91e7..afeb8003a0f2 100644 --- a/include/trace/events/vmalloc.h +++ b/include/trace/events/vmalloc.h @@ -50,6 +50,39 @@ TRACE_EVENT(alloc_vmap_area, __entry->vstart, __entry->vend, __entry->failed) ); +/** + * purge_vmap_area_lazy - called when vmap areas were lazily freed + * @start: purging start address + * @end: purging end address + * @npurged: numbed of purged vmap areas + * + * This event is used for a debug purpose. It gives some + * indication about start:end range and how many objects + * are released. + */ +TRACE_EVENT(purge_vmap_area_lazy, + + TP_PROTO(unsigned long start, unsigned long end, + unsigned int npurged), + + TP_ARGS(start, end, npurged), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, npurged) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->npurged = npurged; + ), + + TP_printk("start=0x%lx end=0x%lx num_purged=%u", + __entry->start, __entry->end, __entry->npurged) +); + #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ -- cgit v1.2.3 From fabc27f7649e070c4f6c742e436a51ff68c4a280 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:49 +0200 Subject: mm: vmalloc: add free_vmap_area_noflush trace event This event is used in order to validate/debug a start address of freed VA, number of currently outstanding and maximum allowed areas. Link: https://lkml.kernel.org/r/20221018181053.434508-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h index afeb8003a0f2..ad4e02191f35 100644 --- a/include/trace/events/vmalloc.h +++ b/include/trace/events/vmalloc.h @@ -83,6 +83,40 @@ TRACE_EVENT(purge_vmap_area_lazy, __entry->start, __entry->end, __entry->npurged) ); +/** + * free_vmap_area_noflush - called when a vmap area is freed + * @va_start: a start address of VA + * @nr_lazy: number of current lazy pages + * @nr_lazy_max: number of maximum lazy pages + * + * This event is used for a debug purpose. It gives some + * indication about a VA that is released, number of current + * outstanding areas and a maximum allowed threshold before + * dropping all of them. 
+ */ +TRACE_EVENT(free_vmap_area_noflush, + + TP_PROTO(unsigned long va_start, unsigned long nr_lazy, + unsigned long nr_lazy_max), + + TP_ARGS(va_start, nr_lazy, nr_lazy_max), + + TP_STRUCT__entry( + __field(unsigned long, va_start) + __field(unsigned long, nr_lazy) + __field(unsigned long, nr_lazy_max) + ), + + TP_fast_assign( + __entry->va_start = va_start; + __entry->nr_lazy = nr_lazy; + __entry->nr_lazy_max = nr_lazy_max; + ), + + TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu", + __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max) +); + #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ -- cgit v1.2.3 From f1a7941243c102a44e8847e3b94ff4ff3ec56f25 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 24 Oct 2022 05:28:41 +0000 Subject: mm: convert mm's rss stats into percpu_counter Currently mm_struct maintains rss_stats which are updated on page fault and the unmapping codepaths. For page fault codepath the updates are cached per thread with the batch of TASK_RSS_EVENTS_THRESH which is 64. The reason for caching is performance for multithreaded applications otherwise the rss_stats updates may become hotspot for such applications. However this optimization comes with the cost of error margin in the rss stats. The rss_stats for applications with large number of threads can be very skewed. At worst the error margin is (nr_threads * 64) and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32. Recently we started seeing the unbounded errors for rss_stats for specific applications which use TCP rx0cp. It seems like vm_insert_pages() codepath does not sync rss_stats at all. This patch converts the rss_stats into percpu_counter to convert the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). However this conversion enable us to get the accurate stats for situations where accuracy is more important than the cpu cost. This patch does not make such tradeoffs - we can just use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read) for the readers. At the moment the readers are either procfs interface, oom_killer and memory reclaim which I think are not performance critical and should be ok with slow read. However I think we can make that change in a separate patch. Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/mm.h | 26 +++++---------- include/linux/mm_types.h | 7 ++-- include/linux/mm_types_task.h | 13 -------- include/linux/percpu_counter.h | 1 - include/linux/sched.h | 3 -- include/trace/events/kmem.h | 8 ++--- kernel/fork.c | 16 ++++++++- mm/memory.c | 73 +++++++----------------------------------- 8 files changed, 40 insertions(+), 107 deletions(-) (limited to 'include/trace') diff --git a/include/linux/mm.h b/include/linux/mm.h index f919befc8fac..0cb4e196d60b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2052,40 +2052,30 @@ static inline bool get_user_page_fast_only(unsigned long addr, */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - long val = atomic_long_read(&mm->rss_stat.count[member]); - -#ifdef SPLIT_RSS_COUNTING - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. 
- */ - if (val < 0) - val = 0; -#endif - return (unsigned long)val; + return percpu_counter_read_positive(&mm->rss_stat[member]); } -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + percpu_counter_add(&mm->rss_stat[member], value); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + percpu_counter_inc(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + percpu_counter_dec(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2d5b1575ffe0..e86861ff5bbd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -626,11 +627,7 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - /* - * Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - struct mm_rss_stat rss_stat; + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 0bb4b6da9993..5414b5c6a103 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,19 +36,6 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) -#define SPLIT_RSS_COUNTING -/* per-thread cached information, */ -struct task_rss_stat { - int events; /* for synchronization threshold */ - int count[NR_MM_COUNTERS]; -}; -#endif /* USE_SPLIT_PTE_PTLOCKS */ - -struct mm_rss_stat { - atomic_long_t count[NR_MM_COUNTERS]; -}; - struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8ed5fba6d156..bde6c4c1f405 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -13,7 +13,6 @@ #include #include #include -#include /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX diff --git a/include/linux/sched.h b/include/linux/sched.h index ffb6eb55cd13..079d299fa465 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,9 +870,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; -#ifdef SPLIT_RSS_COUNTING - struct task_rss_stat rss_stat; -#endif int exit_state; int exit_code; int exit_signal; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 243073cfc29d..58688768ef0f 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -346,10 +346,9 @@ TRACE_MM_PAGES TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, - int member, - long count), + int member), - TP_ARGS(mm, member, count), + TP_ARGS(mm, member), TP_STRUCT__entry( __field(unsigned int, mm_id) @@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat, __entry->mm_id = mm_ptr_to_hash(mm); 
__entry->curr = !!(current->mm == mm); __entry->member = member; - __entry->size = (count << PAGE_SHIFT); + __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) + << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", diff --git a/kernel/fork.c b/kernel/fork.c index 08969f5aa38d..0fef202434c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -753,7 +753,7 @@ static void check_mm(struct mm_struct *mm) "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); + long x = percpu_counter_sum(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", @@ -779,6 +779,8 @@ static void check_mm(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { + int i; + BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); WARN_ON_ONCE(mm == current->active_mm); @@ -788,6 +790,9 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + + for (i = 0; i < NR_MM_COUNTERS; i++) + percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1107,6 +1112,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { + int i; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); @@ -1148,10 +1155,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + for (i = 0; i < NR_MM_COUNTERS; i++) + if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) + goto fail_pcpu; + mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; +fail_pcpu: + while (i > 0) + percpu_counter_destroy(&mm->rss_stat[--i]); fail_nocontext: mm_free_pgd(mm); fail_nopgd: diff --git a/mm/memory.c b/mm/memory.c index 7826143ec9cd..e0555ddd71b5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -162,58 +162,11 @@ static int __init init_zero_pfn(void) } early_initcall(init_zero_pfn); -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) +void mm_trace_rss_stat(struct mm_struct *mm, int member) { - trace_rss_stat(mm, member, count); + trace_rss_stat(mm, member); } -#if defined(SPLIT_RSS_COUNTING) - -void sync_mm_rss(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - if (current->rss_stat.count[i]) { - add_mm_counter(mm, i, current->rss_stat.count[i]); - current->rss_stat.count[i] = 0; - } - } - current->rss_stat.events = 0; -} - -static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) -{ - struct task_struct *task = current; - - if (likely(task->mm == mm)) - task->rss_stat.count[member] += val; - else - add_mm_counter(mm, member, val); -} -#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) -#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) - -/* sync counter once per 64 page faults */ -#define TASK_RSS_EVENTS_THRESH (64) -static void check_sync_rss_stat(struct task_struct *task) -{ - if (unlikely(task != current)) - return; - if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - sync_mm_rss(task->mm); -} -#else /* SPLIT_RSS_COUNTING */ - -#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) -#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) - -static void check_sync_rss_stat(struct 
task_struct *task) -{ -} - -#endif /* SPLIT_RSS_COUNTING */ - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. @@ -1857,7 +1810,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -3153,12 +3106,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter(mm, mm_counter_file(old_page)); + inc_mm_counter(mm, MM_ANONPAGES); } } else { - inc_mm_counter_fast(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); @@ -3965,8 +3917,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); /* @@ -4146,7 +4098,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); setpte: @@ -4336,11 +4288,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) entry = pte_mkuffd_wp(pte_wrprotect(entry)); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } else { - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); @@ -5192,9 +5144,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, count_vm_event(PGFAULT); count_memcg_event_mm(vma->vm_mm, PGFAULT); - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) -- cgit v1.2.3 From 4c9473e87e75a2a77ccd02e55c91ffe6a52b5df6 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Wed, 26 Oct 2022 10:52:18 +0530 Subject: mm/khugepaged: add tracepoint to collapse_file() "mm_khugepaged_collapse_file" for capturing is_shmem. Currently, is_shmem is not being captured. Capturing is_shmem is useful as it can indicate if tmpfs is being used as a backing store instead of persistent storage. Add the tracepoint in collapse_file() named "mm_khugepaged_collapse_file" for capturing is_shmem. 
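The rss_stat conversion earlier in this log leans on the generic percpu_counter API. Below is a minimal, self-contained sketch of the init/update/read/destroy lifecycle that mm->rss_stat[] now follows; the demo_rss structure, counter names, and helper functions are illustrative only and do not appear in the patch:

	#include <linux/percpu_counter.h>
	#include <linux/gfp.h>

	/* Illustrative stand-ins; the real code indexes mm->rss_stat[NR_MM_COUNTERS]. */
	enum { DEMO_ANON, DEMO_FILE, DEMO_NR_COUNTERS };

	struct demo_rss {
		struct percpu_counter stat[DEMO_NR_COUNTERS];
	};

	static int demo_rss_init(struct demo_rss *rss)
	{
		int i, err;

		for (i = 0; i < DEMO_NR_COUNTERS; i++) {
			/* GFP_KERNEL_ACCOUNT matches what mm_init() uses in the patch. */
			err = percpu_counter_init(&rss->stat[i], 0, GFP_KERNEL_ACCOUNT);
			if (err)
				goto unwind;
		}
		return 0;

	unwind:
		while (i > 0)
			percpu_counter_destroy(&rss->stat[--i]);
		return err;
	}

	static void demo_rss_account_fault(struct demo_rss *rss)
	{
		/* Cheap per-CPU update; drift is bounded by the batch size per CPU. */
		percpu_counter_inc(&rss->stat[DEMO_ANON]);
	}

	static unsigned long demo_rss_read_fast(struct demo_rss *rss, int member)
	{
		/* Approximate, never-negative read, as get_mm_counter() now does. */
		return percpu_counter_read_positive(&rss->stat[member]);
	}

	static s64 demo_rss_read_exact(struct demo_rss *rss, int member)
	{
		/* Exact but more expensive: folds in every CPU's local delta. */
		return percpu_counter_sum(&rss->stat[member]);
	}

	static void demo_rss_destroy(struct demo_rss *rss)
	{
		int i;

		for (i = 0; i < DEMO_NR_COUNTERS; i++)
			percpu_counter_destroy(&rss->stat[i]);
	}

The trade-off this lifecycle encodes is the one the commit message describes: per-CPU updates keep the fast path cheap, while exact readers pay for a cross-CPU sum only when they need it.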
[gautammenghani201@gmail.com: swap is_shmem and addr to save space, per Steven Rostedt] Link: https://lkml.kernel.org/r/20221202201807.182829-1-gautammenghani201@gmail.com Link: https://lkml.kernel.org/r/20221026052218.148234-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Reviewed-by: Steven Rostedt (Google) [tracing] Cc: David Hildenbrand Cc: Masami Hiramatsu (Google) Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 38 ++++++++++++++++++++++++++++++++++++++ mm/khugepaged.c | 7 ++++--- 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 760455dfa860..3e6fb05852f9 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -203,5 +203,43 @@ TRACE_EVENT(mm_khugepaged_scan_file, __print_symbolic(__entry->result, SCAN_STATUS)) ); +TRACE_EVENT(mm_khugepaged_collapse_file, + TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index, + bool is_shmem, unsigned long addr, struct file *file, + int nr, int result), + TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result), + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, hpfn) + __field(pgoff_t, index) + __field(unsigned long, addr) + __field(bool, is_shmem) + __string(filename, file->f_path.dentry->d_iname) + __field(int, nr) + __field(int, result) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->hpfn = hpage ? page_to_pfn(hpage) : -1; + __entry->index = index; + __entry->addr = addr; + __entry->is_shmem = is_shmem; + __assign_str(filename, file->f_path.dentry->d_iname); + __entry->nr = nr; + __entry->result = result; + ), + + TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s", + __entry->mm, + __entry->hpfn, + __entry->index, + __entry->addr, + __entry->is_shmem, + __get_str(filename), + __entry->nr, + __print_symbolic(__entry->result, SCAN_STATUS)) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 913b0f489352..78ec2771cc65 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1744,12 +1744,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, { struct address_space *mapping = file->f_mapping; struct page *hpage; - pgoff_t index, end = start + HPAGE_PMD_NR; + pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); - int nr; + int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -2102,7 +2102,8 @@ out: mem_cgroup_uncharge(page_folio(hpage)); put_page(hpage); } - /* TODO: tracepoints */ + + trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); return result; } -- cgit v1.2.3
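All of the tracepoints added in this log are consumed through tracefs at run time. The following user-space sketch enables them and streams the ring buffer; it assumes tracefs is mounted at /sys/kernel/tracing (older setups expose it under /sys/kernel/debug/tracing) and must run as root:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	#define TRACEFS "/sys/kernel/tracing"

	/* Write a short control string to a tracefs file such as ".../enable". */
	static int tracefs_write(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, val, strlen(val)) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd;

		/* Event paths follow TRACE_SYSTEM/TRACE_EVENT names from the headers. */
		tracefs_write(TRACEFS "/events/vmalloc/alloc_vmap_area/enable", "1");
		tracefs_write(TRACEFS "/events/vmalloc/purge_vmap_area_lazy/enable", "1");
		tracefs_write(TRACEFS "/events/vmalloc/free_vmap_area_noflush/enable", "1");
		tracefs_write(TRACEFS "/events/huge_memory/mm_khugepaged_collapse_file/enable", "1");
		tracefs_write(TRACEFS "/events/kmem/rss_stat/enable", "1");

		/* Stream records as they are produced; interrupt with Ctrl-C. */
		fd = open(TRACEFS "/trace_pipe", O_RDONLY);
		if (fd < 0) {
			perror("trace_pipe");
			return 1;
		}
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);

		close(fd);
		return 0;
	}

Each line read from trace_pipe follows the TP_printk() format string of the corresponding event definition above.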