From efe34e99fc41ad2b3fd3bbe79ecd3906620ec961 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Mon, 28 Mar 2022 15:32:25 -0700 Subject: tracing/user_events: Hold event_mutex during dyn_event_add Make sure the event_mutex is properly held during dyn_event_add call. This is required when adding dynamic events. Link: https://lkml.kernel.org/r/20220328223225.1992-1-beaub@linux.microsoft.com Reported-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 8b3d241a31c2..61d78d64bdf0 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1165,11 +1165,11 @@ static int user_event_parse(char *name, char *args, char *flags, #endif mutex_lock(&event_mutex); + ret = user_event_trace_register(user); - mutex_unlock(&event_mutex); if (ret) - goto put_user; + goto put_user_lock; user->index = index; @@ -1181,8 +1181,12 @@ static int user_event_parse(char *name, char *args, char *flags, set_bit(user->index, page_bitmap); hash_add(register_table, &user->node, key); + mutex_unlock(&event_mutex); + *newuser = user; return 0; +put_user_lock: + mutex_unlock(&event_mutex); put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); -- cgit v1.2.3 From 768c1e7f1de03afd0b55e0e951efc272309eeb52 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 29 Mar 2022 10:30:51 -0700 Subject: tracing/user_events: Remove eBPF interfaces Remove eBPF interfaces within user_events to ensure they are fully reviewed. Link: https://lore.kernel.org/all/20220329165718.GA10381@kbox/ Link: https://lkml.kernel.org/r/20220329173051.10087-1-beaub@linux.microsoft.com Suggested-by: Alexei Starovoitov Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/user_events.rst | 14 ++----- include/uapi/linux/user_events.h | 53 --------------------------- kernel/trace/trace_events_user.c | 73 +------------------------------------ 3 files changed, 4 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index bddedabaca80..c180936f49fc 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -7,7 +7,7 @@ user_events: User-based Event Tracing Overview -------- User based trace events allow user processes to create events and trace data -that can be viewed via existing tools, such as ftrace, perf and eBPF. +that can be viewed via existing tools, such as ftrace and perf. To enable this feature, build your kernel with CONFIG_USER_EVENTS=y. Programs can view status of the events via @@ -67,8 +67,7 @@ The command string format is as follows:: Supported Flags ^^^^^^^^^^^^^^^ -**BPF_ITER** - EBPF programs attached to this event will get the raw iovec -struct instead of any data copies for max performance. +None yet Field Format ^^^^^^^^^^^^ @@ -160,7 +159,7 @@ The following values are defined to aid in checking what has been attached: **EVENT_STATUS_FTRACE** - Bit set if ftrace has been attached (Bit 0). -**EVENT_STATUS_PERF** - Bit set if perf/eBPF has been attached (Bit 1). +**EVENT_STATUS_PERF** - Bit set if perf has been attached (Bit 1). Writing Data ------------ @@ -204,13 +203,6 @@ It's advised for user programs to do the following:: **NOTE:** *The write_index is not emitted out into the trace being recorded.* -EBPF ----- -EBPF programs that attach to a user-based event tracepoint are given a pointer -to a struct user_bpf_context. The bpf context contains the data type (which can -be a user or kernel buffer, or can be a pointer to the iovec) and the data -length that was emitted (minus the write_index). - Example Code ------------ See sample code in samples/user_events. diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index e570840571e1..736e05603463 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -32,9 +32,6 @@ /* Create dynamic location entry within a 32-bit value */ #define DYN_LOC(offset, size) ((size) << 16 | (offset)) -/* Use raw iterator for attached BPF program(s), no affect on ftrace/perf */ -#define FLAG_BPF_ITER (1 << 0) - /* * Describes an event registration and stores the results of the registration. * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum @@ -63,54 +60,4 @@ struct user_reg { /* Requests to delete a user_event */ #define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) -/* Data type that was passed to the BPF program */ -enum { - /* Data resides in kernel space */ - USER_BPF_DATA_KERNEL, - - /* Data resides in user space */ - USER_BPF_DATA_USER, - - /* Data is a pointer to a user_bpf_iter structure */ - USER_BPF_DATA_ITER, -}; - -/* - * Describes an iovec iterator that BPF programs can use to access data for - * a given user_event write() / writev() call. - */ -struct user_bpf_iter { - - /* Offset of the data within the first iovec */ - __u32 iov_offset; - - /* Number of iovec structures */ - __u32 nr_segs; - - /* Pointer to iovec structures */ - const struct iovec *iov; -}; - -/* Context that BPF programs receive when attached to a user_event */ -struct user_bpf_context { - - /* Data type being passed (see union below) */ - __u32 data_type; - - /* Length of the data */ - __u32 data_len; - - /* Pointer to data, varies by data type */ - union { - /* Kernel data (data_type == USER_BPF_DATA_KERNEL) */ - void *kdata; - - /* User data (data_type == USER_BPF_DATA_USER) */ - void *udata; - - /* Direct iovec (data_type == USER_BPF_DATA_ITER) */ - struct user_bpf_iter *iter; - }; -}; - #endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 61d78d64bdf0..846c27bc7aef 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -42,9 +42,6 @@ #define MAX_FIELD_ARRAY_SIZE 1024 #define MAX_FIELD_ARG_NAME 256 -#define MAX_BPF_COPY_SIZE PAGE_SIZE -#define MAX_STACK_BPF_DATA 512 - static char *register_page_data; static DEFINE_MUTEX(reg_mutex); @@ -405,19 +402,6 @@ parse: type[0] != 'u', FILTER_OTHER); } -static void user_event_parse_flags(struct user_event *user, char *flags) -{ - char *flag; - - if (flags == NULL) - return; - - while ((flag = strsep(&flags, ",")) != NULL) { - if (strcmp(flag, "BPF_ITER") == 0) - user->flags |= FLAG_BPF_ITER; - } -} - static int user_event_parse_fields(struct user_event *user, char *args) { char *field; @@ -713,64 +697,14 @@ discard: } #ifdef CONFIG_PERF_EVENTS -static void user_event_bpf(struct user_event *user, struct iov_iter *i) -{ - struct user_bpf_context context; - struct user_bpf_iter bpf_i; - char fast_data[MAX_STACK_BPF_DATA]; - void *temp = NULL; - - if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) { - /* Raw iterator */ - context.data_type = USER_BPF_DATA_ITER; - context.data_len = i->count; - context.iter = &bpf_i; - - bpf_i.iov_offset = i->iov_offset; - bpf_i.iov = i->iov; - bpf_i.nr_segs = i->nr_segs; - } else if (i->nr_segs == 1 && iter_is_iovec(i)) { - /* Single buffer from user */ - context.data_type = USER_BPF_DATA_USER; - context.data_len = i->count; - context.udata = i->iov->iov_base + i->iov_offset; - } else { - /* Multi buffer from user */ - struct iov_iter copy = *i; - size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE); - - context.data_type = USER_BPF_DATA_KERNEL; - context.kdata = fast_data; - - if (unlikely(copy_size > sizeof(fast_data))) { - temp = kmalloc(copy_size, GFP_NOWAIT); - - if (temp) - context.kdata = temp; - else - copy_size = sizeof(fast_data); - } - - context.data_len = copy_nofault(context.kdata, - copy_size, ©); - } - - trace_call_bpf(&user->call, &context); - - kfree(temp); -} - /* - * Writes the user supplied payload out to perf ring buffer or eBPF program. + * Writes the user supplied payload out to perf ring buffer. */ static void user_event_perf(struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted) { struct hlist_head *perf_head; - if (bpf_prog_array_valid(&user->call)) - user_event_bpf(user, i); - perf_head = this_cpu_ptr(user->call.perf_events); if (perf_head && !hlist_empty(perf_head)) { @@ -1136,8 +1070,6 @@ static int user_event_parse(char *name, char *args, char *flags, user->tracepoint.name = name; - user_event_parse_flags(user, flags); - ret = user_event_parse_fields(user, args); if (ret) @@ -1579,9 +1511,6 @@ static int user_seq_show(struct seq_file *m, void *p) busy++; } - if (flags & FLAG_BPF_ITER) - seq_puts(m, " FLAG:BPF_ITER"); - seq_puts(m, "\n"); active++; } -- cgit v1.2.3 From fcbf591cedbd44d82d7765b3c7e261858108593f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 30 Mar 2022 15:58:35 -0400 Subject: tracing: Set user_events to BROKEN After being merged, user_events become more visible to a wider audience that have concerns with the current API. It is too late to fix this for this release, but instead of a full revert, just mark it as BROKEN (which prevents it from being selected in make config). Then we can work finding a better API. If that fails, then it will need to be completely reverted. Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Link: https://lkml.kernel.org/r/20220330155835.5e1f6669@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 16a52a71732d..f83d29adb375 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -741,6 +741,7 @@ config USER_EVENTS bool "User trace events" select TRACING select DYNAMIC_EVENTS + depends on BROKEN || COMPILE_TEST # API needs to be straighten out help User trace events are user-defined trace events that can be used like an existing kernel trace event. User trace -- cgit v1.2.3 From 18bfee3216fa6f28d55ebf88d824a539d2bec3c7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 30 Mar 2022 09:00:19 +0200 Subject: ftrace: Make ftrace_graph_is_dead() a static branch ftrace_graph_is_dead() is used on hot paths, it just reads a variable in memory and is not worth suffering function call constraints. For instance, at entry of prepare_ftrace_return(), inlining it avoids saving prepare_ftrace_return() parameters to stack and restoring them after calling ftrace_graph_is_dead(). While at it using a static branch is even more performant and is rather well adapted considering that the returned value will almost never change. Inline ftrace_graph_is_dead() and replace 'kill_ftrace_graph' bool by a static branch. The performance improvement is noticeable. Link: https://lkml.kernel.org/r/e0411a6a0ed3eafff0ad2bc9cd4b0e202b4617df.1648623570.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 16 +++++++++++++++- kernel/trace/fgraph.c | 17 +++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 37b619185ec9..f15a4b76cbfc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -1015,7 +1016,20 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, extern int register_ftrace_graph(struct fgraph_ops *ops); extern void unregister_ftrace_graph(struct fgraph_ops *ops); -extern bool ftrace_graph_is_dead(void); +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +DECLARE_STATIC_KEY_FALSE(kill_ftrace_graph); + +static inline bool ftrace_graph_is_dead(void) +{ + return static_branch_unlikely(&kill_ftrace_graph); +} + extern void ftrace_graph_stop(void); /* The current handlers in use */ diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 19028e072cdb..8f4fb328133a 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -7,6 +7,7 @@ * * Highly modified by Steven Rostedt (VMware). */ +#include #include #include #include @@ -23,24 +24,12 @@ #define ASSIGN_OPS_HASH(opsname, val) #endif -static bool kill_ftrace_graph; +DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph); int ftrace_graph_active; /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; -/** - * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called - * - * ftrace_graph_stop() is called when a severe error is detected in - * the function graph tracing. This function is called by the critical - * paths of function graph to keep those paths from doing any more harm. - */ -bool ftrace_graph_is_dead(void) -{ - return kill_ftrace_graph; -} - /** * ftrace_graph_stop - set to permanently disable function graph tracing * @@ -51,7 +40,7 @@ bool ftrace_graph_is_dead(void) */ void ftrace_graph_stop(void) { - kill_ftrace_graph = true; + static_branch_enable(&kill_ftrace_graph); } /* Add a function return address to the trace stack on thread info.*/ -- cgit v1.2.3 From 5cfff569cab8bf544bab62c911c5d6efd5af5e05 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 1 Apr 2022 14:39:03 -0400 Subject: tracing: Move user_events.h temporarily out of include/uapi While user_events API is under development and has been marked for broken to not let the API become fixed, move the header file out of the uapi directory. This is to prevent it from being installed, then later changed, and then have an old distro user space update with a new kernel, where applications see the user_events being available, but the old header is in place, and then they get compiled incorrectly. Also, surround the include with CONFIG_COMPILE_TEST to the current location, but when the BROKEN tag is taken off, it will use the uapi directory, and fail to compile. This is a good way to remind us to move the header back. Link: https://lore.kernel.org/all/20220330155835.5e1f6669@gandalf.local.home Link: https://lkml.kernel.org/r/20220330201755.29319-1-mathieu.desnoyers@efficios.com Link: https://lkml.kernel.org/r/20220401143903.188384f3@gandalf.local.home Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 63 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/user_events.h | 63 ---------------------------------------- kernel/trace/trace_events_user.c | 5 ++++ 3 files changed, 68 insertions(+), 63 deletions(-) create mode 100644 include/linux/user_events.h delete mode 100644 include/uapi/linux/user_events.h (limited to 'kernel') diff --git a/include/linux/user_events.h b/include/linux/user_events.h new file mode 100644 index 000000000000..736e05603463 --- /dev/null +++ b/include/linux/user_events.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave + */ +#ifndef _UAPI_LINUX_USER_EVENTS_H +#define _UAPI_LINUX_USER_EVENTS_H + +#include +#include + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_PREFIX "u:" + +/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ +#define EVENT_BIT_FTRACE 0 +#define EVENT_BIT_PERF 1 +#define EVENT_BIT_OTHER 7 + +#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) +#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) +#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) + +/* Create dynamic location entry within a 32-bit value */ +#define DYN_LOC(offset, size) ((size) << 16 | (offset)) + +/* + * Describes an event registration and stores the results of the registration. + * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum + * must set the size and name_args before invocation. + */ +struct user_reg { + + /* Input: Size of the user_reg structure being used */ + __u32 size; + + /* Input: Pointer to string with event name, description and flags */ + __u64 name_args; + + /* Output: Byte index of the event within the status page */ + __u32 status_index; + + /* Output: Index of the event to use when writing data */ + __u32 write_index; +}; + +#define DIAG_IOC_MAGIC '*' + +/* Requests to register a user_event */ +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) + +/* Requests to delete a user_event */ +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) + +#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h deleted file mode 100644 index 736e05603463..000000000000 --- a/include/uapi/linux/user_events.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (c) 2021, Microsoft Corporation. - * - * Authors: - * Beau Belgrave - */ -#ifndef _UAPI_LINUX_USER_EVENTS_H -#define _UAPI_LINUX_USER_EVENTS_H - -#include -#include - -#ifdef __KERNEL__ -#include -#else -#include -#endif - -#define USER_EVENTS_SYSTEM "user_events" -#define USER_EVENTS_PREFIX "u:" - -/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ -#define EVENT_BIT_FTRACE 0 -#define EVENT_BIT_PERF 1 -#define EVENT_BIT_OTHER 7 - -#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) -#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) -#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) - -/* Create dynamic location entry within a 32-bit value */ -#define DYN_LOC(offset, size) ((size) << 16 | (offset)) - -/* - * Describes an event registration and stores the results of the registration. - * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum - * must set the size and name_args before invocation. - */ -struct user_reg { - - /* Input: Size of the user_reg structure being used */ - __u32 size; - - /* Input: Pointer to string with event name, description and flags */ - __u64 name_args; - - /* Output: Byte index of the event within the status page */ - __u32 status_index; - - /* Output: Index of the event to use when writing data */ - __u32 write_index; -}; - -#define DIAG_IOC_MAGIC '*' - -/* Requests to register a user_event */ -#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) - -/* Requests to delete a user_event */ -#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) - -#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 846c27bc7aef..706e1686b5eb 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -18,7 +18,12 @@ #include #include #include +/* Reminder to move to uapi when everything works */ +#ifdef CONFIG_COMPILE_TEST +#include +#else #include +#endif #include "trace.h" #include "trace_dynevent.h" -- cgit v1.2.3