From f733c6b508bcaa3441ba1eacf16efb9abd47489f Mon Sep 17 00:00:00 2001
From: Alexander Shishkin
Date: Fri, 4 Oct 2019 15:57:29 +0300
Subject: perf/core: Fix inheritance of aux_output groups

Commit ab43762ef010 ("perf: Allow normal events to output AUX data")
forgets to configure the aux_output relation in inherited groups, which
results in child PEBS events forever failing to schedule.

Fix this by setting up the AUX output link in the inheritance path.

Signed-off-by: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: https://lkml.kernel.org/r/20191004125729.32397-1-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3f0cb82e4fbc..f953dd16a5e2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11862,6 +11862,10 @@ static int inherit_group(struct perf_event *parent_event,
 				    child, leader, child_ctx);
 		if (IS_ERR(child_ctr))
 			return PTR_ERR(child_ctr);
+
+		if (sub->aux_event == parent_event &&
+		    !perf_get_aux_event(child_ctr, leader))
+			return -EINVAL;
 	}
 	return 0;
 }
--
cgit v1.2.3
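The aux_output link being inherited here is the one requested from user
space through perf_event_attr.aux_output. The sketch below shows roughly
how such a group is set up; it is illustrative only (not part of the
patch), assumes an Intel PT capable CPU and v5.4+ UAPI headers that
define aux_output, and omits the AUX-area mmap and most error handling:

#include <linux/perf_event.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr pt = { .size = sizeof(pt) };
	struct perf_event_attr pebs = { .size = sizeof(pebs) };
	int pt_fd, pebs_fd;
	FILE *f;

	/* The Intel PT PMU type is dynamic; read it from sysfs. */
	f = fopen("/sys/bus/event_source/devices/intel_pt/type", "r");
	if (!f || fscanf(f, "%u", &pt.type) != 1)
		return 1;
	fclose(f);

	/* Group leader: the AUX (Intel PT) event, for the current task. */
	pt.inherit = 1;		/* inherited across fork(): the fixed path */
	pt_fd = perf_event_open(&pt, 0, -1, -1, 0);
	if (pt_fd < 0)
		return 1;

	/* PEBS event whose records go into the leader's AUX buffer. */
	pebs.type = PERF_TYPE_HARDWARE;
	pebs.config = PERF_COUNT_HW_INSTRUCTIONS;
	pebs.precise_ip = 2;	/* request precise (PEBS) sampling */
	pebs.aux_output = 1;	/* emit via the group leader's AUX event */
	pebs.inherit = 1;
	pebs_fd = perf_event_open(&pebs, 0, -1, pt_fd, 0);
	if (pebs_fd < 0)
		return 1;

	/* ... enable the group, run a workload, decode the AUX data ... */
	close(pebs_fd);
	close(pt_fd);
	return 0;
}

Before the fix, forking with such a group active left the child's PEBS
event without its AUX output link, so it could never be scheduled.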
From d44248a41337731a111374822d7d4451b64e73e4 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Wed, 4 Sep 2019 14:46:18 -0700
Subject: perf/core: Rework memory accounting in perf_mmap()

perf_mmap() always increases user->locked_vm. As a result, "extra"
could grow bigger than "user_extra", which doesn't make sense.

Here is an example case (note: assume "user_lock_limit" is very small):

  | # of perf_mmap calls | vma->vm_mm->pinned_vm | user->locked_vm |
  |          0           |           0           |        0        |
  |          1           |      user_extra       |   user_extra    |
  |          2           |    3 * user_extra     | 2 * user_extra  |
  |          3           |    6 * user_extra     | 3 * user_extra  |
  |          4           |   10 * user_extra     | 4 * user_extra  |

Fix this by maintaining proper user_extra and extra.

Reviewed-By: Hechao Li
Reported-by: Hechao Li
Signed-off-by: Song Liu
Signed-off-by: Peter Zijlstra (Intel)
Cc: <stable@vger.kernel.org>
Cc: Jie Meng
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190904214618.3795672-1-songliubraving@fb.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f953dd16a5e2..2b8265ad7bf5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5668,7 +5668,8 @@ again:
 	 * undo the VM accounting.
 	 */
 
-	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+			&mmap_user->locked_vm);
 	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
 	free_uid(mmap_user);
 
@@ -5812,8 +5813,20 @@ accounting:
 
 	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-	if (user_locked > user_lock_limit)
+	if (user_locked <= user_lock_limit) {
+		/* charge all to locked_vm */
+	} else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
+		/* charge all to pinned_vm */
+		extra = user_extra;
+		user_extra = 0;
+	} else {
+		/*
+		 * charge locked_vm until it hits user_lock_limit;
+		 * charge the rest from pinned_vm
+		 */
 		extra = user_locked - user_lock_limit;
+		user_extra -= extra;
+	}
 
 	lock_limit = rlimit(RLIMIT_MEMLOCK);
 	lock_limit >>= PAGE_SHIFT;
--
cgit v1.2.3
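The fixed branch structure can be read as a standalone splitting rule:
charge new pages to locked_vm until user_lock_limit is reached, and only
the overflow to pinned_vm. The helper below is an illustrative user-space
restatement of that rule (assumed names, not kernel code), with a small
driver that replays the table from the changelog:

#include <stdio.h>

static void split_charge(unsigned long locked_vm,
			 unsigned long user_lock_limit,
			 unsigned long user_extra,
			 unsigned long *to_locked,
			 unsigned long *to_pinned)
{
	unsigned long user_locked = locked_vm + user_extra;

	if (user_locked <= user_lock_limit) {
		/* everything still fits: charge all to locked_vm */
		*to_locked = user_extra;
		*to_pinned = 0;
	} else if (locked_vm >= user_lock_limit) {
		/* already at the limit: charge all to pinned_vm */
		*to_locked = 0;
		*to_pinned = user_extra;
	} else {
		/* straddling the limit: split the charge */
		*to_pinned = user_locked - user_lock_limit;
		*to_locked = user_extra - *to_pinned;
	}
}

int main(void)
{
	unsigned long locked_vm = 0, pinned_vm = 0, l, p;
	int i;

	/* user_lock_limit = 4 pages, each mmap asks for user_extra = 3 */
	for (i = 1; i <= 4; i++) {
		split_charge(locked_vm, 4, 3, &l, &p);
		locked_vm += l;
		pinned_vm += p;
		printf("call %d: locked_vm=%lu pinned_vm=%lu\n",
		       i, locked_vm, pinned_vm);
	}
	return 0;
}

With these numbers, locked_vm saturates at 4 and pinned_vm then grows by
exactly user_extra per call (0, 2, 5, 8), i.e. linearly, instead of the
quadratic growth (0, user_extra, 3x, 6x, 10x) shown in the table above.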
From 7fa343b7fdc4f351de4e3f28d5c285937dd1f42f Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Tue, 8 Oct 2019 09:59:49 -0700
Subject: perf/core: Fix corner case in perf_rotate_context()

In perf_rotate_context(), when the first cpu flexible event fails to
schedule, cpu_rotate is 1 while cpu_event is NULL. Since cpu_event is
NULL, perf_rotate_context() will _NOT_ call cpu_ctx_sched_out(), thus
cpuctx->ctx.is_active will have EVENT_FLEXIBLE set. Then, the next
perf_event_sched_in() will skip all cpu flexible events because of the
EVENT_FLEXIBLE bit.

In the next call of perf_rotate_context(), cpu_rotate stays 1, and
cpu_event stays NULL, so this process repeats. The end result is that
flexible events on this cpu will not be scheduled (until another event
is added to the cpuctx).

Here is an easy repro of this issue: on Intel CPUs, where ref-cycles
can only use one counter, run one pinned event for ref-cycles, one
flexible event for ref-cycles, and one flexible event for cycles. The
flexible ref-cycles event is never scheduled, which is expected.
However, because of this issue, the cycles event is never scheduled
either.

 $ perf stat -e ref-cycles:D,ref-cycles,cycles -C 5 -I 1000

           time             counts unit events
     1.000152973         15,412,480      ref-cycles:D
     1.000152973      <not counted>      ref-cycles     (0.00%)
     1.000152973      <not counted>      cycles         (0.00%)
     2.000486957         18,263,120      ref-cycles:D
     2.000486957      <not counted>      ref-cycles     (0.00%)
     2.000486957      <not counted>      cycles         (0.00%)

To fix this, when the flexible_active list is empty, try to rotate the
first event in the flexible_groups. Also, rename ctx_first_active() to
ctx_event_to_rotate(), which is more accurate.

Signed-off-by: Song Liu
Signed-off-by: Peter Zijlstra (Intel)
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Sasha Levin
Cc: Thomas Gleixner
Fixes: 8d5bce0c37fa ("perf/core: Optimize perf_rotate_context() event scheduling")
Link: https://lkml.kernel.org/r/20191008165949.920548-1-songliubraving@fb.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2b8265ad7bf5..9ec0b0bfddbd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3779,11 +3779,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
 	perf_event_groups_insert(&ctx->flexible_groups, event);
 }
 
+/* pick an event from the flexible_groups to rotate */
 static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
 {
-	return list_first_entry_or_null(&ctx->flexible_active,
-					struct perf_event, active_list);
+	struct perf_event *event;
+
+	/* pick the first active flexible event */
+	event = list_first_entry_or_null(&ctx->flexible_active,
+					 struct perf_event, active_list);
+
+	/* if no active flexible event, pick the first event */
+	if (!event) {
+		event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+				      typeof(*event), group_node);
+	}
+
+	return event;
 }
 
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3808,9 +3820,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 	perf_pmu_disable(cpuctx->ctx.pmu);
 
 	if (task_rotate)
-		task_event = ctx_first_active(task_ctx);
+		task_event = ctx_event_to_rotate(task_ctx);
 	if (cpu_rotate)
-		cpu_event = ctx_first_active(&cpuctx->ctx);
+		cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
 
 	/*
 	 * As per the order given at ctx_resched() first 'pop' task flexible
--
cgit v1.2.3
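The shape of the fix is easy to model outside the kernel. Below is a toy
user-space model (hypothetical names; the kernel's flexible_active list
and flexible_groups rb-tree are reduced to two pointers) showing why the
fallback to the first queued event un-wedges rotation when no flexible
event is active:

#include <stdio.h>

/* Toy model of the fixed picker (hypothetical types, not kernel code). */
struct event {
	const char *name;
};

struct context {
	/* stands in for ctx->flexible_active (first active event) */
	struct event *active_first;
	/* stands in for rb_first(&ctx->flexible_groups.tree) */
	struct event *queued_first;
};

/*
 * Mirrors the logic of ctx_event_to_rotate(): prefer the first active
 * flexible event; if none could be scheduled (the corner case above),
 * still pick the first queued event so rotation makes progress.
 */
static struct event *event_to_rotate(struct context *ctx)
{
	if (ctx->active_first)
		return ctx->active_first;
	return ctx->queued_first;	/* NULL only if nothing is queued */
}

int main(void)
{
	struct event cycles = { "cycles" };
	/* Corner case: no flexible event is active, but one is queued. */
	struct context ctx = { NULL, &cycles };
	struct event *e = event_to_rotate(&ctx);

	/*
	 * The old ctx_first_active() would return NULL here, so the
	 * context would never be rotated again; the fixed picker still
	 * finds the queued cycles event.
	 */
	printf("rotate: %s\n", e ? e->name : "(none)");
	return 0;
}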