From 93315e46b000fc80fff5d53c3f444417fb3df6de Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:00 +0530 Subject: perf/core: Add speculation info to branch entries Add a new "spec" bitfield to branch entries for providing speculation information. This will be populated using hints provided by branch sampling features on supported hardware. The following cases are covered: * No branch speculation information is available * Branch is speculative but taken on the wrong path * Branch is non-speculative but taken on the correct path * Branch is speculative and taken on the correct path Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/834088c302faf21c7b665031dd111f424e509a64.1660211399.git.sandipan.das@amd.com --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee8b9ecdc03b..ae30c61957d2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1078,6 +1078,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b br->abort = 0; br->cycles = 0; br->type = 0; + br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 03b370062741..30a4723aefd4 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -256,6 +256,17 @@ enum { PERF_BR_MAX, }; +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1363,6 +1374,7 @@ union perf_mem_data_src { * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) * type: branch type + * spec: branch speculation info (or 0 if not supported) */ struct perf_branch_entry { __u64 from; @@ -1373,7 +1385,8 @@ struct perf_branch_entry { abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ type:4, /* branch type */ - reserved:40; + spec:2, /* branch speculation info */ + reserved:38; }; union perf_sample_weight { -- cgit v1.2.3 From a724ec82966d57e4b5d36341d3e3dc1a3c011564 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:15 +0530 Subject: perf: Add system error and not in transaction branch types This expands generic branch type classification by adding two more entries there in i.e system error and not in transaction. This also updates the x86 implementation to process X86_BR_NO_TX records as appropriate. This changes branch types reported to user space on x86 platform but it should not be a problem. The possible scenarios and impacts are enumerated here. 
--------------------------------------------------------------------------
| kernel | perf tool | Impact |
--------------------------------------------------------------------------
| old | old | Works as before |
--------------------------------------------------------------------------
| old | new | PERF_BR_UNKNOWN is processed |
--------------------------------------------------------------------------
| new | old | PERF_BR_NO_TX is blocked via old PERF_BR_MAX |
--------------------------------------------------------------------------
| new | new | PERF_BR_NO_TX is recognized |
--------------------------------------------------------------------------

When PERF_BR_NO_TX is blocked via the old PERF_BR_MAX (new kernel with old perf tool), user space might throw up a warning complaining about unrecognized branch types being reported, but that is expected.

The PERF_BR_SERROR and PERF_BR_NO_TX branch types will be used for the BRBE implementation on the arm64 platform.

PERF_BR_NO_TX complements the 'abort' and 'in_tx' elements in perf_branch_entry, which represent the other transaction states for a given branch record, and thereby completes the transaction state classification.

Signed-off-by: Anshuman Khandual
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: James Clark
Link: https://lkml.kernel.org/r/20220824044822.70230-2-anshuman.khandual@arm.com
---
 arch/x86/events/utils.c | 2 +-
 include/uapi/linux/perf_event.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)
(limited to 'include')

diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c
index e013243f360c..5f5617afde79 100644
--- a/arch/x86/events/utils.c
+++ b/arch/x86/events/utils.c
@@ -225,7 +225,7 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = {
 PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
 PERF_BR_UNKNOWN, /* X86_BR_ABORT */
 PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
- PERF_BR_UNKNOWN, /* X86_BR_NO_TX */
+ PERF_BR_NO_TX, /* X86_BR_NO_TX */
 PERF_BR_CALL, /* X86_BR_ZERO_CALL */
 PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
 PERF_BR_IND, /* X86_BR_IND_JMP */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 30a4723aefd4..a79cc0eb4de7 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -253,6 +253,8 @@ enum {
 PERF_BR_COND_RET = 10, /* conditional function return */
 PERF_BR_ERET = 11, /* exception return */
 PERF_BR_IRQ = 12, /* irq */
+ PERF_BR_SERROR = 13, /* system error */
+ PERF_BR_NO_TX = 14, /* not in transaction */
 PERF_BR_MAX,
 };
-- cgit v1.2.3

From b190bc4ac9e6d9763b61654c5a0c085ff77d7a09 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Wed, 24 Aug 2022 10:18:16 +0530
Subject: perf: Extend branch type classification

branch_entry.type has now run out of space to accommodate the classification of more branch types. This will prevent the perf branch stack implementation on arm64 (via BRBE) from capturing all available branch types. Extending this bit field, i.e. branch_entry.type [4 bits], is not an option as it would break the user space ABI for both little and big endian perf tools.

Extend branch classification with a new field, branch_entry.new_type, via a new branch type PERF_BR_EXTEND_ABI in branch_entry.type. Perf tools which can decode PERF_BR_EXTEND_ABI will then parse branch_entry.new_type as well. branch_entry.new_type is a 4 bit field which can hold up to 16 branch types. The first three branch types will hold various generic fault types, followed by five architecture specific branch types, which can be overridden by the platform for specific use cases.
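For illustration only (not part of this patch), here is a sketch of how a perf tool that understands PERF_BR_EXTEND_ABI might decode the new field; branch_type_name() stands in for whatever helper the tool already uses for the existing types:

    /* Hypothetical tool-side decode of the extended branch types. */
    static const char *decode_branch_type(const struct perf_branch_entry *e)
    {
            if (e->type == PERF_BR_EXTEND_ABI) {
                    switch (e->new_type) {
                    case PERF_BR_NEW_FAULT_ALGN: return "fault_algn";
                    case PERF_BR_NEW_FAULT_DATA: return "fault_data";
                    case PERF_BR_NEW_FAULT_INST: return "fault_inst";
                    default:                     return "arch_specific";
                    }
            }
            /* Old tools never see PERF_BR_EXTEND_ABI and keep working as before. */
            return branch_type_name(e->type);
    }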
These architecture specific branch types get overridden on the arm64 platform for the BRBE implementation.

New generic branch types

 - PERF_BR_NEW_FAULT_ALGN
 - PERF_BR_NEW_FAULT_DATA
 - PERF_BR_NEW_FAULT_INST

New arch specific branch types

 - PERF_BR_NEW_ARCH_1
 - PERF_BR_NEW_ARCH_2
 - PERF_BR_NEW_ARCH_3
 - PERF_BR_NEW_ARCH_4
 - PERF_BR_NEW_ARCH_5

Signed-off-by: Anshuman Khandual
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: James Clark
Link: https://lkml.kernel.org/r/20220824044822.70230-3-anshuman.khandual@arm.com
---
 include/uapi/linux/perf_event.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index a79cc0eb4de7..fed60e6b10e5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -255,6 +255,7 @@ enum {
 PERF_BR_IRQ = 12, /* irq */
 PERF_BR_SERROR = 13, /* system error */
 PERF_BR_NO_TX = 14, /* not in transaction */
+ PERF_BR_EXTEND_ABI = 15, /* extend ABI */
 PERF_BR_MAX,
 };
@@ -269,6 +270,18 @@ enum {
 PERF_BR_SPEC_MAX,
 };
+enum {
+ PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */
+ PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */
+ PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */
+ PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */
+ PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */
+ PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */
+ PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */
+ PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */
+ PERF_BR_NEW_MAX,
+};
+
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
 (PERF_SAMPLE_BRANCH_USER|\
 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -1388,7 +1401,8 @@ struct perf_branch_entry {
 cycles:16, /* cycle count to last branch */
 type:4, /* branch type */
 spec:2, /* branch speculation info */
- reserved:38;
+ new_type:4, /* additional branch type */
+ reserved:34;
 };
 union perf_sample_weight {
-- cgit v1.2.3

From 5402d25aa5710d240040f73fb13d7d5c303ef071 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Wed, 24 Aug 2022 10:18:17 +0530
Subject: perf: Capture branch privilege information

Platforms like arm64 can capture privilege level information for all the branch records. Hence this adds a new element in struct perf_branch_entry to record the privilege level information, which can be requested through a new event.attr.branch_sample_type based flag, PERF_SAMPLE_BRANCH_PRIV_SAVE. This flag helps the user choose whether privilege information is captured.
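As an illustration (not part of this patch), a minimal userspace sketch of how a tool might request and consume the new privilege field; the event set-up is standard perf_event_attr usage and the br pointer is assumed to point at the sampled branch stack payload:

    struct perf_event_attr attr = {
            .type               = PERF_TYPE_HARDWARE,
            .config             = PERF_COUNT_HW_CPU_CYCLES,
            .sample_type        = PERF_SAMPLE_BRANCH_STACK,
            .branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
                                  PERF_SAMPLE_BRANCH_PRIV_SAVE,
    };

    /* Later, while walking the sampled branch entries: */
    for (__u64 i = 0; i < br->nr; i++) {
            if (br->entries[i].priv == PERF_BR_PRIV_KERNEL)
                    kernel_branches++;      /* hypothetical counter */
    }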
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-4-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fed60e6b10e5..1a258d45a3fa 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -233,6 +235,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -282,6 +286,13 @@ enum { PERF_BR_NEW_MAX, }; +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1402,7 +1413,8 @@ struct perf_branch_entry { type:4, /* branch type */ spec:2, /* branch speculation info */ new_type:4, /* additional branch type */ - reserved:34; + priv:3, /* privilege level */ + reserved:31; }; union perf_sample_weight { -- cgit v1.2.3 From f4054e522531038354bea5c924f286fdd8ae77b5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:18 +0530 Subject: perf: Add PERF_BR_NEW_ARCH_[N] map for BRBE on arm64 platform BRBE captured branch types will overflow perf_branch_entry.type and generic branch types in perf_branch_entry.new_type. So override each available arch specific branch type in the following manner to comprehensively process all reported branch types in BRBE. PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-5-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1a258d45a3fa..dca16582885f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -293,6 +293,12 @@ enum { PERF_BR_PRIV_HV = 3, }; +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ -- cgit v1.2.3 From c5b81449f915a28bb9c7725e53aebab3ba39b4a2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:07 +0200 Subject: perf/hw_breakpoint: Provide hw_breakpoint_is_used() and use in test Provide hw_breakpoint_is_used() to check if breakpoints are in use on the system. Use it in the KUnit test to verify the global state before and after a test case. 
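A minimal usage sketch (hypothetical caller; the real KUnit wiring is in the diff that follows): a test or debug facility that needs the breakpoint slots to be free can now bail out early:

    if (hw_breakpoint_is_used())
            return -EBUSY;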
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-3-elver@google.com --- include/linux/hw_breakpoint.h | 3 +++ kernel/events/hw_breakpoint.c | 29 +++++++++++++++++++++++++++++ kernel/events/hw_breakpoint_test.c | 12 +++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 78dd7035d1e5..a3fb846705eb 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -74,6 +74,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); +extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); @@ -121,6 +122,8 @@ register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } +static inline bool hw_breakpoint_is_used(void) { return false; } + static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index f32320ac02fd..fd5cd1f9e7fc 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -604,6 +604,35 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); +/** + * hw_breakpoint_is_used - check if breakpoints are currently used + * + * Returns: true if breakpoints are used, false otherwise. + */ +bool hw_breakpoint_is_used(void) +{ + int cpu; + + if (!constraints_initialized) + return false; + + for_each_possible_cpu(cpu) { + for (int type = 0; type < TYPE_MAX; ++type) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + + if (info->cpu_pinned) + return true; + + for (int slot = 0; slot < nr_slots[type]; ++slot) { + if (info->tsk_pinned[slot]) + return true; + } + } + } + + return false; +} + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c index 433c5c45e2a5..5ced822df788 100644 --- a/kernel/events/hw_breakpoint_test.c +++ b/kernel/events/hw_breakpoint_test.c @@ -294,7 +294,14 @@ static struct kunit_case hw_breakpoint_test_cases[] = { static int test_init(struct kunit *test) { /* Most test cases want 2 distinct CPUs. */ - return num_online_cpus() < 2 ? -EINVAL : 0; + if (num_online_cpus() < 2) + return -EINVAL; + + /* Want the system to not use breakpoints elsewhere. */ + if (hw_breakpoint_is_used()) + return -EBUSY; + + return 0; } static void test_exit(struct kunit *test) @@ -308,6 +315,9 @@ static void test_exit(struct kunit *test) kthread_stop(__other_task); __other_task = NULL; } + + /* Verify that internal state agrees that no breakpoints are in use. 
*/ + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); } static struct kunit_suite hw_breakpoint_test_suite = { -- cgit v1.2.3 From 0370dc314df35579b751d1b77c9169f071444962 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:09 +0200 Subject: perf/hw_breakpoint: Optimize list of per-task breakpoints On a machine with 256 CPUs, running the recently added perf breakpoint benchmark results in: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 236.418 [sec] | | 123134.794271 usecs/op | 7880626.833333 usecs/op/cpu The benchmark tests inherited breakpoint perf events across many threads. Looking at a perf profile, we can see that the majority of the time is spent in various hw_breakpoint.c functions, which execute within the 'nr_bp_mutex' critical sections which then results in contention on that mutex as well: 37.27% [kernel] [k] osq_lock 34.92% [kernel] [k] mutex_spin_on_owner 12.15% [kernel] [k] toggle_bp_slot 11.90% [kernel] [k] __reserve_bp_slot The culprit here is task_bp_pinned(), which has a runtime complexity of O(#tasks) due to storing all task breakpoints in the same list and iterating through that list looking for a matching task. Clearly, this does not scale to thousands of tasks. Instead, make use of the "rhashtable" variant "rhltable" which stores multiple items with the same key in a list. This results in average runtime complexity of O(1) for task_bp_pinned(). With the optimization, the benchmark shows: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.208 [sec] | | 108.422396 usecs/op | 6939.033333 usecs/op/cpu On this particular setup that's a speedup of ~1135x. While one option would be to make task_struct a breakpoint list node, this would only further bloat task_struct for infrequently used data. Furthermore, after all optimizations in this series, there's no evidence it would result in better performance: later optimizations make the time spent looking up entries in the hash table negligible (we'll reach the theoretical ideal performance i.e. no constraints). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-5-elver@google.com --- include/linux/perf_event.h | 3 ++- kernel/events/hw_breakpoint.c | 56 +++++++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ae30c61957d2..1999408a9cbb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks { }; #ifdef CONFIG_HAVE_HW_BREAKPOINT +#include #include #endif @@ -178,7 +179,7 @@ struct hw_perf_event { * creation and event initalization. 
*/ struct arch_hw_breakpoint info; - struct list_head bp_list; + struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6076c6346291..6d09edc80d19 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -26,10 +26,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) } /* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); +static struct rhltable task_bps_ht; +static const struct rhashtable_params task_bps_ht_params = { + .head_offset = offsetof(struct hw_perf_event, bp_list), + .key_offset = offsetof(struct hw_perf_event, target), + .key_len = sizeof_field(struct hw_perf_event, target), + .automatic_shrinking = true, +}; static int constraints_initialized; @@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.target; + struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.target == tsk && - find_slot_idx(iter->attr.bp_type) == type && + rcu_read_lock(); + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); + if (!head) + goto out; + + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { + if (find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } +out: + rcu_read_unlock(); return count; } @@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, /* * Add/remove the given breakpoint in our constraint table */ -static void +static int toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { @@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; - return; + return 0; } /* Pinned counter task profiling */ @@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, toggle_bp_task_slot(bp, cpu, type, weight); if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); else - list_del(&bp->hw.bp_list); + return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (ret) return ret; - toggle_bp_slot(bp, true, type, weight); - - return 0; + return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) @@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); + WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) @@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { int cpu, err_cpu; - int i; + int i, ret; for (i = 0; i < TYPE_MAX; i++) nr_slots[i] = hw_breakpoint_slots(i); @@ -718,18 +728,24 @@ int __init init_hw_breakpoint(void) info->tsk_pinned = kcalloc(nr_slots[i], 
sizeof(int), GFP_KERNEL); - if (!info->tsk_pinned) - goto err_alloc; + if (!info->tsk_pinned) { + ret = -ENOMEM; + goto err; + } } } + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); + if (ret) + goto err; + constraints_initialized = 1; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - err_alloc: +err: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) kfree(get_bp_info(err_cpu, i)->tsk_pinned); @@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void) break; } - return -ENOMEM; + return ret; } - - -- cgit v1.2.3 From 9caf87be118f4639537404eeb67dd444a3716e9a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:12 +0200 Subject: perf/hw_breakpoint: Make hw_breakpoint_weight() inlinable Due to being a __weak function, hw_breakpoint_weight() will cause the compiler to always emit a call to it. This generates unnecessarily bad code (register spills etc.) for no good reason; in fact it appears in profiles of `perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512`: ... 0.70% [kernel] [k] hw_breakpoint_weight ... While a small percentage, no architecture defines its own hw_breakpoint_weight() nor are there users outside hw_breakpoint.c, which makes the fact it is currently __weak a poor choice. Change hw_breakpoint_weight()'s definition to follow a similar protocol to hw_breakpoint_slots(), such that if defines hw_breakpoint_weight(), we'll use it instead. The result is that it is inlined and no longer shows up in profiles. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-8-elver@google.com --- include/linux/hw_breakpoint.h | 1 - kernel/events/hw_breakpoint.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a3fb846705eb..f319bd26b030 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -80,7 +80,6 @@ extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); -int hw_breakpoint_weight(struct perf_event *bp); int arch_reserve_bp_slot(struct perf_event *bp); void arch_release_bp_slot(struct perf_event *bp); void arch_unregister_hw_breakpoint(struct perf_event *bp); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9fb66d358d81..9c9bf17666a5 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -124,10 +124,12 @@ err: } #endif -__weak int hw_breakpoint_weight(struct perf_event *bp) +#ifndef hw_breakpoint_weight +static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } +#endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { -- cgit v1.2.3 From 01fe8a3f818e1074a9a95d624be4549ee7ea2b2b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:15 +0200 Subject: locking/percpu-rwsem: Add percpu_is_write_locked() and percpu_is_read_locked() Implement simple accessors to probe percpu-rwsem's locked state: percpu_is_write_locked(), percpu_is_read_locked(). 
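A usage sketch (hypothetical caller, not part of this patch); the accessors are meant for assertions and sanity checks rather than for making locking decisions:

    static void check_sem_idle(struct percpu_rw_semaphore *sem)
    {
            WARN_ON_ONCE(percpu_is_write_locked(sem));  /* no writer expected here */
            WARN_ON_ONCE(percpu_is_read_locked(sem));   /* ... and no readers either */
    }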
Signed-off-by: Marco Elver
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Dmitry Vyukov
Acked-by: Ian Rogers
Link: https://lore.kernel.org/r/20220829124719.675715-11-elver@google.com
---
 include/linux/percpu-rwsem.h | 6 ++++++
 kernel/locking/percpu-rwsem.c | 6 ++++++
 2 files changed, 12 insertions(+)
(limited to 'include')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5fda40f97fe9..36b942b67b7d 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -121,9 +121,15 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 preempt_enable();
 }
+extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
+static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
+{
+ return atomic_read(&sem->block);
+}
+
 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 5fe4c5495ba3..185bd1c906b0 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
 __sum; \
 })
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
 /*
 * Return true if the modular sum of the sem->read_count per-CPU variable is
 * zero. If this sum is zero, then it is stable due to the fact that if any
-- cgit v1.2.3

From 3aac580d5cc3001ca1627725b3b61edb529f341d Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Thu, 1 Sep 2022 06:09:54 -0700
Subject: perf: Add sample_flags to indicate the PMU-filled sample data

On some platforms, some data, e.g. timestamps, can be retrieved from the PMU driver. Usually, the data from the PMU driver is more accurate. The perf kernel should output the PMU-filled sample data if it is available.

To check the availability of the PMU-filled sample data, the current perf kernel initializes the related fields in perf_sample_data_init(). When outputting a sample, perf checks whether the field has been updated by the PMU driver. If so, the updated value is output. If not, perf calculates the value in software, or just outputs the initialized value if no software method is available either.

With more and more data being provided by the PMU driver, more fields have to be initialized in perf_sample_data_init(). That increases the number of cache lines touched in perf_sample_data_init() and is harmful to performance.

Add a new "sample_flags" field to indicate the PMU-filled sample data. The PMU driver should set the corresponding PERF_SAMPLE_ flag when it fills in a field; the initialization of that field is then no longer required. The following patches will make use of this and remove the corresponding fields from perf_sample_data_init(), which will further minimize the number of cache lines touched.

In perf_prepare_sample(), only clear the sample flags that have already been handled by the PMU driver; this applies to PERF_RECORD_SAMPLE only, since for the other PERF_RECORD_ event types the sample data is not available.
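The driver-side pattern ends up looking like the sketch below; this is a condensed form of what the follow-up patches in this series do for the branch stack (cpuc->lbr_stack is the x86 LBR case):

    /* In the PMU driver's overflow handler: */
    if (has_branch_stack(event)) {
            data.br_stack = &cpuc->lbr_stack;               /* PMU-provided value */
            data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;  /* mark it as filled */
    }

    /* In perf_prepare_sample(): skip whatever the driver already filled in. */
    filtered_sample_type = sample_type & ~data->sample_flags;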
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 17 +++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1999408a9cbb..0978165a2d87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1008,6 +1008,7 @@ struct perf_sample_data { * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ + u64 sample_flags; u64 addr; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -1057,6 +1058,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ + data->sample_flags = 0; data->addr = addr; data->raw = NULL; data->br_stack = NULL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..c9b9cb79231a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6794,11 +6794,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6827,7 +6826,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -7303,6 +7302,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; @@ -7310,7 +7310,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. + */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); -- cgit v1.2.3 From a9a931e2666878343782c82d7d55cc173ddeb3e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:56 -0700 Subject: perf: Use sample_flags for branch stack Use the new sample_flags to indicate whether the branch stack is filled by the PMU driver. Remove the br_stack from the perf_sample_data_init() to minimize the number of cache lines touched. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-4-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/amd/core.c | 4 +++- arch/x86/events/core.c | 4 +++- arch/x86/events/intel/core.c | 4 +++- arch/x86/events/intel/ds.c | 5 ++++- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 4 ++-- 7 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 13919eb96931..1ad1efdb33f9 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2297,6 +2297,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, cpuhw = this_cpu_ptr(&cpu_hw_events); power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 36bede1d7b1e..bd99d2ae14c3 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -929,8 +929,10 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f969410d0c90..bb34a28fa71b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1714,8 +1714,10 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..ba101c28dcc9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2995,8 +2995,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cdd857bd2dd6..0489f750baa0 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1640,8 +1640,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_TIME; } - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1791,6 +1793,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0978165a2d87..1e12e79454e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1011,7 +1011,6 @@ struct perf_sample_data { u64 sample_flags; u64 
addr; struct perf_raw_record *raw; - struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; @@ -1021,6 +1020,8 @@ struct perf_sample_data { * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + u64 type; u64 ip; struct { @@ -1061,7 +1062,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->sample_flags = 0; data->addr = addr; data->raw = NULL; - data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; diff --git a/kernel/events/core.c b/kernel/events/core.c index c9b9cb79231a..104c0c9f4e6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7052,7 +7052,7 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr @@ -7358,7 +7358,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { if (perf_sample_save_hw_index(event)) size += sizeof(u64); -- cgit v1.2.3 From 2abe681da0a192ab850a5271d838a7817b469fca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:57 -0700 Subject: perf: Use sample_flags for weight Use the new sample_flags to indicate whether the weight field is filled by the PMU driver. Remove the weight field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-5-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 5 +++-- arch/x86/events/intel/ds.c | 10 +++++++--- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 1ad1efdb33f9..a5c95a2006ea 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2305,9 +2305,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && - ppmu->get_mem_weight) + ppmu->get_mem_weight) { ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); - + data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } else if (period) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 0489f750baa0..4c51118e4add 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1527,8 +1527,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { data->weight.full = pebs->lat; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } /* * data.data_src encodes the data source @@ -1620,9 +1622,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. 
*/ - if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); - + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); @@ -1764,6 +1767,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e12e79454e0..06a587b5faa9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1021,6 +1020,7 @@ struct perf_sample_data { * perf_{prepare,output}_sample(). */ struct perf_branch_stack *br_stack; + union perf_sample_weight weight; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 104c0c9f4e6f..f0af45db02b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7408,6 +7408,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From e16fd7f2cb1a65555cfe76f983eaefb1eab7471f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:58 -0700 Subject: perf: Use sample_flags for data_src Use the new sample_flags to indicate whether the data_src field is filled by the PMU driver. Remove the data_src field from the perf_sample_data_init() to minimize the number of cache lines touched. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-6-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a5c95a2006ea..6ec7069e6482 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2301,8 +2301,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && - ppmu->get_mem_data_src) + ppmu->get_mem_data_src) { ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); + data.sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && ppmu->get_mem_weight) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4c51118e4add..bde73d492889 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1535,8 +1535,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, pebs->dse); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1770,8 +1772,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, meminfo->aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06a587b5faa9..6849f10dfc7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1013,7 +1013,6 @@ struct perf_sample_data { struct perf_raw_record *raw; u64 period; u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { */ struct perf_branch_stack *br_stack; union perf_sample_weight weight; + union perf_mem_data_src data_src; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index f0af45db02b3..163e2f478e61 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7411,6 +7411,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) data->weight.full = 0; + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From ee9db0e14b0575aa827579dc2471a29ec5fc6877 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:59 -0700 Subject: perf: Use sample_flags for txn Use the new sample_flags to indicate whether the txn field is filled by the PMU driver. Remove the txn field from the perf_sample_data_init() to minimize the number of cache lines touched. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-7-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bde73d492889..a5275c235c2a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1628,9 +1628,11 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } /* @@ -1780,9 +1782,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, gprs ? gprs->ax : 0); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } if (format_size & PEBS_DATACFG_XMMS) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6849f10dfc7e..581880ddb9ef 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - u64 txn; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; union perf_sample_weight weight; union perf_mem_data_src data_src; + u64 txn; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->txn = 0; } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 163e2f478e61..15d27b14c827 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = PERF_MEM_NA; + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From 03b02db93be407103c385814033633364674a6f6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Sep 2022 14:14:14 +0530 Subject: perf: Consolidate branch sample filter helpers Besides the branch type filtering requests, 'event.attr.branch_sample_type' also contains various flags indicating which additional information should be captured, along with the base branch record. These flags help configure the underlying hardware, and capture the branch records appropriately when required e.g after PMU interrupt. But first, this moves an existing helper perf_sample_save_hw_index() into the header before adding some more helpers for other branch sample filter flags. 
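As a sketch of how a PMU driver might consume such helpers when programming branch-record hardware (the driver name and the MY_PMU_* control bits below are hypothetical):

    static u64 my_pmu_branch_config(struct perf_event *event)
    {
            u64 config = 0;

            if (!branch_sample_no_cycles(event))
                    config |= MY_PMU_RECORD_CYCLES;         /* hypothetical bit */
            if (branch_sample_type(event))
                    config |= MY_PMU_RECORD_BR_TYPE;        /* hypothetical bit */
            if (branch_sample_priv(event))
                    config |= MY_PMU_RECORD_PRIV_LEVEL;     /* hypothetical bit */

            return config;
    }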
Signed-off-by: Anshuman Khandual
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20220906084414.396220-1-anshuman.khandual@arm.com
---
 include/linux/perf_event.h | 26 ++++++++++++++++++++++++++
 kernel/events/core.c | 9 ++-------
 2 files changed, 28 insertions(+), 7 deletions(-)
(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 581880ddb9ef..a627528e5f3a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1685,4 +1685,30 @@ static inline void perf_lopwr_cb(bool mode)
 }
 #endif
+#ifdef CONFIG_PERF_EVENTS
+static inline bool branch_sample_no_flags(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS;
+}
+
+static inline bool branch_sample_no_cycles(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES;
+}
+
+static inline bool branch_sample_type(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE;
+}
+
+static inline bool branch_sample_hw_index(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
+}
+
+static inline bool branch_sample_priv(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE;
+}
+#endif /* CONFIG_PERF_EVENTS */
 #endif /* _LINUX_PERF_EVENT_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 15d27b14c827..00389d5f9241 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6966,11 +6966,6 @@ static void perf_output_read(struct perf_output_handle *handle,
 perf_output_read_one(handle, event, enabled, running);
 }
-static inline bool perf_sample_save_hw_index(struct perf_event *event)
-{
- return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
-}
-
 void perf_output_sample(struct perf_output_handle *handle,
 struct perf_event_header *header,
 struct perf_sample_data *data,
@@ -7059,7 +7054,7 @@ void perf_output_sample(struct perf_output_handle *handle,
 * sizeof(struct perf_branch_entry);
 perf_output_put(handle, data->br_stack->nr);
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
 perf_output_put(handle, data->br_stack->hw_idx);
 perf_output_copy(handle, data->br_stack->entries, size);
 } else {
@@ -7359,7 +7354,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
 int size = sizeof(u64); /* nr */
 if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
 size += sizeof(u64);
 size += data->br_stack->nr
-- cgit v1.2.3

From 7517f08b9a5eef0fa683b976c97d6178d00e6a3d Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Wed, 7 Sep 2022 14:49:21 +0530
Subject: perf/core: Expand PERF_EVENT_FLAG_ARCH

Two hardware event flags on the x86 platform have overshot PERF_EVENT_FLAG_ARCH (0x0000ffff). These flags are PERF_X86_EVENT_PEBS_LAT_HYBRID (0x20000) and PERF_X86_EVENT_AMD_BRS (0x10000). Let's expand the PERF_EVENT_FLAG_ARCH mask to accommodate those flags, and also create room for two more in the future.
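To make the point concrete, the kind of check this enables looks like the sketch below (the two x86 flag values are quoted from the message above; the asserts themselves are only illustrative):

    /* With the mask widened to 0x000fffff, both offending x86 flags now fit: */
    static_assert((0x10000 /* PERF_X86_EVENT_AMD_BRS */         & ~PERF_EVENT_FLAG_ARCH) == 0);
    static_assert((0x20000 /* PERF_X86_EVENT_PEBS_LAT_HYBRID */ & ~PERF_EVENT_FLAG_ARCH) == 0);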
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-2-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a627528e5f3a..3e3c07512b75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -138,7 +138,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 /** -- cgit v1.2.3 From f67dd218fafd9de9a13d095e775b621db76a058f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:22 +0530 Subject: perf/core: Assert PERF_EVENT_FLAG_ARCH does not overlap with generic flags This just ensures that PERF_EVENT_FLAG_ARCH does not overlap with generic hardware event flags. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-3-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3e3c07512b75..f88cb31eaf75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -141,6 +141,8 @@ struct hw_perf_event_extra { #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); + /** * struct hw_perf_event - performance event hardware details: */ -- cgit v1.2.3 From 91207f62616f9f51b52436364e6d064f002e9112 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:23 +0530 Subject: arm64/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-4-anshuman.khandual@arm.com --- drivers/perf/arm_spe_pmu.c | 4 +++- include/linux/perf/arm_pmu.h | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index b65a7d9640e1..db8a0a841062 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -44,7 +44,9 @@ * This allows us to perform the check, i.e, perfmon_capable(), * in the context of the event owner, once, during the event_init(). 
*/ -#define SPE_PMU_HW_FLAGS_CX BIT(0) +#define SPE_PMU_HW_FLAGS_CX 0x00001 + +static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX); static void set_spe_event_has_cx(struct perf_event *event) { diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0407a38b470a..0356cb6a215d 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,11 @@ /* * ARM PMU hw_event flags */ -/* Event uses a 64bit counter */ -#define ARMPMU_EVT_64BIT 1 -/* Event uses a 47bit counter */ -#define ARMPMU_EVT_47BIT 2 +#define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */ +#define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */ + +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT); +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT); #define HW_OP_UNSUPPORTED 0xFFFF #define C(_x) PERF_COUNT_HW_CACHE_##_x -- cgit v1.2.3 From f3c0eba287049237b23d1300376768293eb89e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2022 18:48:55 +0200 Subject: perf: Add a few assertions While auditing 6b959ba22d34 ("perf/core: Fix reentry problem in perf_output_read_group()") a few spots were found that wanted assertions. Notable for_each_sibling_event() relies on exclusion from modification. This would normally be holding either ctx->lock or ctx->mutex, however due to how things are constructed disabling IRQs is a valid and sufficient substitute for ctx->lock. Another possible site to add assertions would be the various pmu::{add,del,read,..}() methods, but that's not trivially expressable in C -- the best option is wrappers, but those are easy enough to forget. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 17 +++++++++++++++++ kernel/events/core.c | 2 ++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f88cb31eaf75..368bdc4f563f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -634,7 +635,23 @@ struct pmu_event_list { struct list_head list; }; +/* + * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex + * as such iteration must hold either lock. However, since ctx->lock is an IRQ + * safe lock, and is only held by the CPU doing the modification, having IRQs + * disabled is sufficient since it will hold-off the IPIs. 
+ */ +#ifdef CONFIG_PROVE_LOCKING +#define lockdep_assert_event_ctx(event) \ + WARN_ON_ONCE(__lockdep_enabled && \ + (this_cpu_read(hardirqs_enabled) || \ + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) +#else +#define lockdep_assert_event_ctx(event) +#endif + #define for_each_sibling_event(sibling, event) \ + lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) diff --git a/kernel/events/core.c b/kernel/events/core.c index 00389d5f9241..3e90e454b995 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1468,6 +1468,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; -- cgit v1.2.3 From b4e12b2d70fd9eccdb3cef8015dc1788ca38e3fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:04 -0700 Subject: perf: Kill __PERF_SAMPLE_CALLCHAIN_EARLY There's no in-tree user anymore. Let's get rid of it. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-3-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 10 ---------- arch/x86/events/intel/core.c | 3 --- include/uapi/linux/perf_event.h | 2 -- 3 files changed, 15 deletions(-) (limited to 'include') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index dab094166693..ce5720bfb350 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -300,16 +300,6 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; - /* - * rip recorded by IbsOpRip will not be consistent with rsp and rbp - * recorded as part of interrupt regs. Thus we need to use rip from - * interrupt regs while unwinding call stack. Setting _EARLY flag - * makes sure we unwind call-stack before perf sample rip is set to - * IbsOpRip. - */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; - return 0; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7f4e7e6b45f0..b16c91ac9219 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3868,9 +3868,6 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index dca16582885f..e639c74cf5fb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -164,8 +164,6 @@ enum perf_event_sample_format { PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ - - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) -- cgit v1.2.3 From 7b084630153152239d84990ac4540c2dd360186f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:31 -0700 Subject: perf: Use sample_flags for addr Use the new sample_flags to indicate whether the addr field is filled by the PMU driver. As most PMU drivers pass 0, it can set the flag only if it has a non-zero value. And use 0 in perf_sample_output() if it's not filled already. 
Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-1-namhyung@kernel.org --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 8 ++++++-- kernel/events/core.c | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4ba6ab6d0d92..d2e9ff16f6ed 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1621,8 +1621,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && - x86_pmu.intel_cap.pebs_format >= 1) + x86_pmu.intel_cap.pebs_format >= 1) { data->addr = pebs->dla; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ @@ -1783,8 +1785,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_DATA_SRC; } - if (sample_type & PERF_SAMPLE_ADDR_TYPE) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { data->addr = meminfo->address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 368bdc4f563f..f4a13579b0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - u64 addr; struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; + u64 addr; u64 type; u64 ip; @@ -1079,9 +1079,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = 0; - data->addr = addr; data->raw = NULL; data->period = period; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index c07e9a3ea94c..a91f74db9fe9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,11 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) data->txn = 0; + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From 838d9bb62d132ec3baf1b5aba2e95ef9a7a9a3cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:32 -0700 Subject: perf: Use sample_flags for raw_data Use the new sample_flags to indicate whether the raw data field is filled by the PMU driver. Although it could be checked against NULL, follow the same rule as the other fields. Remove the raw field from perf_sample_data_init() to minimize the number of cache lines touched.
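A minimal sketch of the driver-side pattern, mirroring the s390 and IBS hunks below (illustrative only; buf and len are made-up locals describing the opaque record):

	/* Hypothetical PMU driver fragment: hand a raw record to the core. */
	struct perf_raw_record raw = {
		.frag = {
			.size = len,	/* len: made-up record length */
			.data = buf,	/* buf: made-up pointer to the record */
		},
	};
	raw.size = raw.frag.size;
	data.raw = &raw;
	data.sample_flags |= PERF_SAMPLE_RAW;	/* mark the raw field as filled */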
Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-2-namhyung@kernel.org --- arch/s390/kernel/perf_cpum_cf.c | 1 + arch/s390/kernel/perf_pai_crypto.c | 1 + arch/x86/events/amd/ibs.c | 1 + include/linux/perf_event.h | 5 ++--- kernel/events/core.c | 3 ++- 5 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f7dd3c849e68..f043a7ff220b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -664,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event, raw.frag.data = cpuhw->stop; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, &regs); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index b38b4ae01589..6826e2a69a21 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -366,6 +366,7 @@ static int paicrypt_push_sample(void) raw.frag.data = cpump->save; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, &regs); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ce5720bfb350..c29a006954c7 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -781,6 +781,7 @@ fail: }, }; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f4a13579b0e8..e9b151cde491 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; u64 addr; + struct perf_raw_record *raw; u64 type; u64 ip; @@ -1078,8 +1078,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->sample_flags = 0; - data->raw = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; if (addr) { diff --git a/kernel/events/core.c b/kernel/events/core.c index a91f74db9fe9..04e19a857d4b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7332,7 +7332,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7348,6 +7348,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; -- cgit v1.2.3 From ee3e88dfec23153d0675b5d00522297b9adf657c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:51 +0530 Subject: perf/mem: Introduce PERF_MEM_LVLNUM_{EXTN_MEM|IO} Introduce PERF_MEM_LVLNUM_EXTN_MEM, which can be used to indicate accesses to extension memory such as CXL. PERF_MEM_LVL_IO can be used for I/O accesses, but it cannot distinguish between local and remote I/O. Introduce a new encoding, PERF_MEM_LVLNUM_IO, which can be combined with PERF_MEM_REMOTE_REMOTE to indicate remote I/O accesses.
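A hedged sketch of how a PMU driver might encode a remote I/O access with the new value (illustrative only; nothing here is mandated by the patch):

	/* Hypothetical fragment: mark a load as remote I/O in the data source. */
	union perf_mem_data_src dsrc = { .val = 0 };

	dsrc.mem_op      = PERF_MEM_OP_LOAD;
	dsrc.mem_lvl_num = PERF_MEM_LVLNUM_IO;		/* I/O level */
	dsrc.mem_remote  = PERF_MEM_REMOTE_REMOTE;	/* remote rather than local I/O */
	data->data_src   = dsrc;
	data->sample_flags |= PERF_SAMPLE_DATA_SRC;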
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-2-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e639c74cf5fb..4ae3c249f675 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1336,7 +1336,9 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0xa available */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -- cgit v1.2.3 From cfef80bad4cf79cdc964a53c98254dfa462be83f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:57 +0530 Subject: perf/uapi: Define PERF_MEM_SNOOPX_PEER in kernel header file PERF_MEM_SNOOPX_PEER is defined only in the tools uapi header. Although it's used only by the perf tool, not defining it in the kernel header can create problems in the future. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-8-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4ae3c249f675..85be78e0e7f6 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1356,7 +1356,7 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_SHIFT 19 #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -/* 1 free */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ #define PERF_MEM_SNOOPX_SHIFT 38 /* locked instruction */ -- cgit v1.2.3 From 0ce38047e82a02017839b6cae837f13a1383a3a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 10:46:58 +0200 Subject: perf: Fix lockdep_assert_event_ctx() I'm a flamin' moron: even after Mark told me it should be '&&', I still got it wrong in the final commit. Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Borislav Petkov Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Tested-by: Borislav Petkov Link: https://lkml.kernel.org/r/YvvIWmDBWdIUCMZj@FVFF77S0Q05N --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e9b151cde491..853f64b6c8c2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -644,7 +644,7 @@ struct pmu_event_list { #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ - (this_cpu_read(hardirqs_enabled) || \ + (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) -- cgit v1.2.3
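For reference, a hedged sketch of the two locking contexts that satisfy the corrected assertion when walking the sibling list (illustrative only; do_work() is a made-up callee):

	/* Either hold the ctx mutex... */
	mutex_lock(&event->ctx->mutex);
	for_each_sibling_event(sibling, event)
		do_work(sibling);
	mutex_unlock(&event->ctx->mutex);

	/* ...or run with IRQs disabled, e.g. under ctx->lock or from an IPI. */
	local_irq_disable();
	for_each_sibling_event(sibling, event)
		do_work(sibling);
	local_irq_enable();

With the '&&' fix, the WARN fires only when lockdep is enabled, IRQs are enabled, and ctx->mutex is not held.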