diff options
33 files changed, 861 insertions, 411 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a5f4004e8705..f0461456d910 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2011,6 +2011,19 @@ Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, the default is off. + kprobe_event=[probe-list] + [FTRACE] Add kprobe events and enable at boot time. + The probe-list is a semicolon delimited list of probe + definitions. Each definition is same as kprobe_events + interface, but the parameters are comma delimited. + For example, to add a kprobe event on vfs_read with + arg1 and arg2, add to the command line; + + kprobe_event=p,vfs_read,$arg1,$arg2 + + See also Documentation/trace/kprobetrace.rst "Kernel + Boot Parameter" section. + kpti= [ARM64] Control page table isolation of user and kernel address spaces. Default: enabled on cores which need mitigation. diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 7d2b0178d3f3..fbb314bfa112 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -51,15 +51,17 @@ Synopsis of kprobe_events $argN : Fetch the Nth function argument. (N >= 1) (\*1) $retval : Fetch return value.(\*2) $comm : Fetch current task comm. - +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(\*3) + +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types - (x8/x16/x32/x64), "string" and bitfield are supported. + (x8/x16/x32/x64), "string", "ustring" and bitfield + are supported. (\*1) only for the probe on function entry (offs == 0). (\*2) only for return probe. (\*3) this is useful for fetching a field of data structures. + (\*4) "u" means user-space dereference. 
See :ref:`user_mem_access`. Types ----- @@ -77,7 +79,8 @@ apply it to registers/stack-entries etc. (for example, '$stack1:x8[8]' is wrong, but '+8($stack):x8[8]' is OK.) String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container -has been paged out. +has been paged out. "ustring" type is an alternative of string for user-space. +See :ref:`user_mem_access` for more info. The string array type is a bit different from other types. For other base types, <base-type>[1] is equal to <base-type> (e.g. +0(%di):x32[1] is same as +0(%di):x32.) But string[1] is not equal to string. The string type itself @@ -92,6 +95,25 @@ Symbol type('symbol') is an alias of u32 or u64 type (depends on BITS_PER_LONG) which shows given pointer in "symbol+offset" style. For $comm, the default type is "string"; any other type is invalid. +.. _user_mem_access: +User Memory Access +------------------ +Kprobe events support user-space memory access. For that purpose, you can use +either user-space dereference syntax or 'ustring' type. + +The user-space dereference syntax allows you to access a field of a data +structure in user-space. This is done by adding the "u" prefix to the +dereference syntax. For example, +u4(%si) means it will read memory from the +address in the register %si offset by 4, and the memory is expected to be in +user-space. You can use this for strings too, e.g. +u0(%si):string will read +a string from the address in the register %si that is expected to be in user- +space. 'ustring' is a shortcut way of performing the same task. That is, ++0(%si):ustring is equivalent to +u0(%si):string. + +Note that kprobe-event provides the user-memory access syntax but it doesn't +use it transparently. This means if you use normal dereference or string type +for user memory, it might fail, and may always fail on some archs. 
The user +has to carefully check if the target data is in kernel or user space. Per-Probe Event Filtering ------------------------- @@ -124,6 +146,20 @@ You can check the total number of probe hits and probe miss-hits via The first column is event name, the second is the number of probe hits, the third is the number of probe miss-hits. +Kernel Boot Parameter +--------------------- +You can add and enable new kprobe events when booting up the kernel by +"kprobe_event=" parameter. The parameter accepts a semicolon-delimited +kprobe events, which format is similar to the kprobe_events. +The difference is that the probe definition parameters are comma-delimited +instead of space. For example, adding myprobe event on do_sys_open like below + + p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack) + +should be below for kernel boot parameter (just replace spaces with comma) + + p:myprobe,do_sys_open,dfd=%ax,filename=%dx,flags=%cx,mode=+4($stack) + Usage examples -------------- diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index 0b21305fabdc..6e75a6c5a2c8 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -42,16 +42,18 @@ Synopsis of uprobe_tracer @+OFFSET : Fetch memory at OFFSET (OFFSET from same file as PATH) $stackN : Fetch Nth entry of stack (N >= 0) $stack : Fetch stack address. - $retval : Fetch return value.(*) + $retval : Fetch return value.(\*1) $comm : Fetch current task comm. - +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) + +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*2)(\*3) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types (x8/x16/x32/x64), "string" and bitfield are supported. - (*) only for return probe. - (**) this is useful for fetching a field of data structures. 
+ (\*1) only for return probe. + (\*2) this is useful for fetching a field of data structures. + (\*3) Unlike kprobe event, "u" prefix will just be ignored, because uprobe + events can access only user-space memory. Types ----- diff --git a/arch/Kconfig b/arch/Kconfig index e8d19c3cb91f..6dd1faab6ccb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -128,22 +128,6 @@ config UPROBES managed by the kernel and kept transparent to the probed application. ) -config HAVE_64BIT_ALIGNED_ACCESS - def_bool 64BIT && !HAVE_EFFICIENT_UNALIGNED_ACCESS - help - Some architectures require 64 bit accesses to be 64 bit - aligned, which also requires structs containing 64 bit values - to be 64 bit aligned too. This includes some 32 bit - architectures which can do 64 bit accesses, as well as 64 bit - architectures without unaligned access. - - This symbol should be selected by an architecture if 64 bit - accesses are required to be 64 bit aligned in this way even - though it is not a 64 bit architecture. - - See Documentation/unaligned-memory-access.txt for more - information on the topic of unaligned memory accesses. 
- config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c82abd6e4ca3..9c4435307ff8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -66,7 +66,9 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un }) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +static inline bool pagefault_disabled(void); +# define WARN_ON_IN_IRQ() \ + WARN_ON_ONCE(!in_task() && !pagefault_disabled()) #else # define WARN_ON_IN_IRQ() #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4b73f5937f41..024c3053dbba 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -373,7 +373,7 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } -static int add_breakpoints(struct dyn_ftrace *rec, int enable) +static int add_breakpoints(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_addr; int ret; @@ -481,7 +481,7 @@ static int add_update_nop(struct dyn_ftrace *rec) return add_update_code(ip, new); } -static int add_update(struct dyn_ftrace *rec, int enable) +static int add_update(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_addr; int ret; @@ -527,7 +527,7 @@ static int finish_update_nop(struct dyn_ftrace *rec) return ftrace_write(ip, new, 1); } -static int finish_update(struct dyn_ftrace *rec, int enable) +static int finish_update(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_addr; int ret; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 25e2995d4a4c..8a8cb3c401b2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -427,8 +427,8 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); iter = ftrace_rec_iter_next(iter)) -int ftrace_update_record(struct dyn_ftrace *rec, int enable); -int ftrace_test_record(struct dyn_ftrace *rec, int enable); +int 
ftrace_update_record(struct dyn_ftrace *rec, bool enable); +int ftrace_test_record(struct dyn_ftrace *rec, bool enable); void ftrace_run_stop_machine(int command); unsigned long ftrace_location(unsigned long ip); unsigned long ftrace_location_range(unsigned long start, unsigned long end); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8a62731673f7..5150436783e8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -142,6 +142,7 @@ enum print_line_t { enum print_line_t trace_handle_return(struct trace_seq *s); void tracing_generic_entry_update(struct trace_entry *entry, + unsigned short type, unsigned long flags, int pc); struct trace_event_file; @@ -317,6 +318,14 @@ trace_event_name(struct trace_event_call *call) return call->name; } +static inline struct list_head * +trace_get_fields(struct trace_event_call *event_call) +{ + if (!event_call->class->get_fields) + return &event_call->class->fields; + return event_call->class->get_fields(event_call); +} + struct trace_array; struct trace_subsystem_dir; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2b70130af585..34a038563d97 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -203,7 +203,10 @@ static inline void pagefault_enable(void) /* * Is the pagefault handler disabled? If so, user access methods will not sleep. 
*/ -#define pagefault_disabled() (current->pagefault_disabled != 0) +static inline bool pagefault_disabled(void) +{ + return current->pagefault_disabled != 0; +} /* * The pagefault handler is in general disabled by pagefault_disable() or @@ -240,6 +243,18 @@ extern long probe_kernel_read(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size); /* + * probe_user_read(): safely attempt to read from a location in user space + * @dst: pointer to the buffer that shall take the data + * @src: address to read from + * @size: size of the data chunk + * + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ +extern long probe_user_read(void *dst, const void __user *src, size_t size); +extern long __probe_user_read(void *dst, const void __user *src, size_t size); + +/* * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to * @src: pointer to the data that shall be written @@ -252,6 +267,9 @@ extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, + long count); +extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); /** * probe_kernel_address(): safely attempt to read from a location diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9f5433a52488..9873fc627d61 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2276,6 +2276,7 @@ static int __init init_kprobes(void) init_test_probes(); return err; } +subsys_initcall(init_kprobes); #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, @@ -2588,5 +2589,3 @@ static int __init debugfs_kprobe_init(void) late_initcall(debugfs_kprobe_init); 
#endif /* CONFIG_DEBUG_FS */ - -module_init(init_kprobes); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 564e5fdb025f..98da8998c25c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -597,9 +597,19 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_STARTUP_TEST + bool "Run selftest on trace events" + depends on FTRACE_STARTUP_TEST + default y + help + This option performs a test on all trace events in the system. + It basically just enables each event and runs some code that + will trigger events (not necessarily the event it enables). + This may take some time to run as there are a lot of events. + config EVENT_TRACE_TEST_SYSCALLS bool "Run selftest on syscall events" - depends on FTRACE_STARTUP_TEST + depends on EVENT_TRACE_STARTUP_TEST help This option will also enable testing every syscall event. It only enables the event and disables it and runs various loads diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 576c41644e77..eca34503f178 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1622,6 +1622,11 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1750,15 +1755,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, } /* - * If the rec had TRAMP enabled, then it needs to - * be cleared. As TRAMP can only be enabled iff - * there is only a single ops attached to it. - * In otherwords, always disable it on decrementing. - * In the future, we may set it if rec count is - * decremented to one, and the ops that is left - * has a trampoline. 
+ * The TRAMP needs to be set only if rec count + * is decremented to one, and the ops that is + * left has a trampoline. As TRAMP can only be + * enabled if there is only a single ops attached + * to it. */ - rec->flags &= ~FTRACE_FL_TRAMP; + if (ftrace_rec_count(rec) == 1 && + ftrace_find_tramp_ops_any(rec)) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; /* * flags will be cleared in ftrace_check_record() @@ -1768,7 +1775,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, count++; /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ - update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE; /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) @@ -1951,11 +1958,6 @@ static void print_ip_ins(const char *fmt, const unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static struct ftrace_ops * -ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); -static struct ftrace_ops * -ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); - enum ftrace_bug_type ftrace_bug_type; const void *ftrace_expected; @@ -2047,7 +2049,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } } -static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) { unsigned long flag = 0UL; @@ -2146,28 +2148,28 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /** * ftrace_update_record, set a record that now is tracing or not * @rec: the record to update - * @enable: set to 1 if the record is tracing, zero to force disable + * @enable: set to true if the record is tracing, false to force disable * * The records that represent all functions that can be traced need * to be updated when tracing has been enabled. 
*/ -int ftrace_update_record(struct dyn_ftrace *rec, int enable) +int ftrace_update_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 1); + return ftrace_check_record(rec, enable, true); } /** * ftrace_test_record, check if the record has been enabled or not * @rec: the record to test - * @enable: set to 1 to check if enabled, 0 if it is disabled + * @enable: set to true to check if enabled, false if it is disabled * * The arch code may need to test if a record is already set to * tracing to determine how to modify the function code that it * represents. */ -int ftrace_test_record(struct dyn_ftrace *rec, int enable) +int ftrace_test_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 0); + return ftrace_check_record(rec, enable, false); } static struct ftrace_ops * @@ -2356,7 +2358,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) } static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2395,7 +2397,7 @@ void __weak ftrace_replace_code(int mod_flags) { struct dyn_ftrace *rec; struct ftrace_page *pg; - int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; int failed; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05b0b3139ebc..66358d66c933 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -128,16 +128,7 @@ int ring_buffer_print_entry_header(struct trace_seq *s) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define 
RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) +#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -2373,7 +2364,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + if (length > RB_MAX_SMALL_DATA) { event->type_len = 0; event->array[0] = length; } else @@ -2388,11 +2379,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length++; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + if (length > RB_MAX_SMALL_DATA) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); + length = ALIGN(length, RB_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c90c687cf950..525a97fbbc60 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -366,7 +366,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list + * trace_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove @@ -743,8 +743,7 @@ trace_event_setup(struct ring_buffer_event *event, { struct trace_entry *ent = ring_buffer_event_data(event); - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; + tracing_generic_entry_update(ent, type, flags, pc); } static __always_inline struct ring_buffer_event * @@ -2312,13 +2311,14 @@ enum print_line_t trace_handle_return(struct trace_seq *s) EXPORT_SYMBOL_GPL(trace_handle_return); void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, - int pc) 
+tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, + unsigned long flags, int pc) { struct task_struct *tsk = current; entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->type = type; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -4842,12 +4842,13 @@ static const char readme_msg[] = "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - "\t $stack<index>, $stack, $retval, $comm, $arg<N>\n" + "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" #else - "\t $stack<index>, $stack, $retval, $comm\n" + "\t $stack<index>, $stack, $retval, $comm,\n" #endif + "\t +|-[u]<offset>(<fetcharg>)\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" - "\t b<bit-width>@<bit-offset>/<container-size>,\n" + "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4629a6104474..0892e38ed6fb 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -416,8 +416,7 @@ void perf_trace_buf_update(void *record, u16 type) unsigned long flags; local_save_flags(flags); - tracing_generic_entry_update(entry, flags, pc); - entry->type = type; + tracing_generic_entry_update(entry, type, flags, pc); } NOKPROBE_SYMBOL(perf_trace_buf_update); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0ce3db67f556..c7506bc81b75 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -70,14 +70,6 @@ static int system_refcount_dec(struct event_subsystem *system) #define while_for_each_event_file() \ } -static struct list_head * -trace_get_fields(struct trace_event_call *event_call) -{ - if (!event_call->class->get_fields) - return 
&event_call->class->fields; - return event_call->class->get_fields(event_call); -} - static struct ftrace_event_field * __find_event_field(struct list_head *head, char *name) { @@ -3190,7 +3182,7 @@ void __init trace_event_init(void) event_trace_enable(); } -#ifdef CONFIG_FTRACE_STARTUP_TEST +#ifdef CONFIG_EVENT_TRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); static DEFINE_SPINLOCK(test_spinlock_irq); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5079d1db3754..c773b8fb270c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1084,6 +1084,9 @@ int filter_assign_type(const char *type) if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; + if (strcmp(type, "char *") == 0 || strcmp(type, "const char *") == 0) + return FILTER_PTR_STRING; + return FILTER_OTHER; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7d736248a070..9d483ad9bb6c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -12,6 +12,8 @@ #include <linux/rculist.h> #include <linux/error-injection.h> +#include <asm/setup.h> /* for COMMAND_LINE_SIZE */ + #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" @@ -19,6 +21,18 @@ #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 +#define MAX_KPROBE_CMDLINE_SIZE 1024 + +/* Kprobe early definition from command line */ +static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; +static bool kprobe_boot_events_enabled __initdata; + +static int __init set_kprobe_boot_events(char *str) +{ + strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + return 0; +} +__setup("kprobe_event=", set_kprobe_boot_events); static int trace_kprobe_create(int argc, const char **argv); static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -128,8 +142,8 @@ static bool trace_kprobe_match(const char *system, const char *event, { 
struct trace_kprobe *tk = to_trace_kprobe(ev); - return strcmp(trace_event_name(&tk->tp.call), event) == 0 && - (!system || strcmp(tk->tp.call.class->system, system) == 0); + return strcmp(trace_probe_name(&tk->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0); } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -143,6 +157,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +static nokprobe_inline bool trace_kprobe_is_registered(struct trace_kprobe *tk) +{ + return !(list_empty(&tk->rp.kp.list) && + hlist_unhashed(&tk->rp.kp.hlist)); +} + /* Return 0 if it fails to find the symbol address */ static nokprobe_inline unsigned long trace_kprobe_address(struct trace_kprobe *tk) @@ -183,6 +203,16 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); +static void free_trace_kprobe(struct trace_kprobe *tk) +{ + if (tk) { + trace_probe_cleanup(&tk->tp); + kfree(tk->symbol); + free_percpu(tk->nhit); + kfree(tk); + } +} + /* * Allocate new trace_probe and initialize it (including kprobes). 
*/ @@ -220,49 +250,20 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.kp.pre_handler = kprobe_dispatcher; tk->rp.maxactive = maxactive; + INIT_HLIST_NODE(&tk->rp.kp.hlist); + INIT_LIST_HEAD(&tk->rp.kp.list); - if (!event || !group) { - ret = -EINVAL; - goto error; - } - - tk->tp.call.class = &tk->tp.class; - tk->tp.call.name = kstrdup(event, GFP_KERNEL); - if (!tk->tp.call.name) - goto error; - - tk->tp.class.system = kstrdup(group, GFP_KERNEL); - if (!tk->tp.class.system) + ret = trace_probe_init(&tk->tp, event, group); + if (ret < 0) goto error; dyn_event_init(&tk->devent, &trace_kprobe_ops); - INIT_LIST_HEAD(&tk->tp.files); return tk; error: - kfree(tk->tp.call.name); - kfree(tk->symbol); - free_percpu(tk->nhit); - kfree(tk); + free_trace_kprobe(tk); return ERR_PTR(ret); } -static void free_trace_kprobe(struct trace_kprobe *tk) -{ - int i; - - if (!tk) - return; - - for (i = 0; i < tk->tp.nr_args; i++) - traceprobe_free_probe_arg(&tk->tp.args[i]); - - kfree(tk->tp.call.class->system); - kfree(tk->tp.call.name); - kfree(tk->symbol); - free_percpu(tk->nhit); - kfree(tk); -} - static struct trace_kprobe *find_trace_kprobe(const char *event, const char *group) { @@ -270,8 +271,8 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, struct trace_kprobe *tk; for_each_trace_kprobe(tk, pos) - if (strcmp(trace_event_name(&tk->tp.call), event) == 0 && - strcmp(tk->tp.call.class->system, group) == 0) + if (strcmp(trace_probe_name(&tk->tp), event) == 0 && + strcmp(trace_probe_group_name(&tk->tp), group) == 0) return tk; return NULL; } @@ -280,7 +281,7 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) { int ret = 0; - if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_registered(tk) && !trace_kprobe_has_gone(tk)) { if (trace_kprobe_is_return(tk)) ret = enable_kretprobe(&tk->rp); else @@ -297,34 +298,27 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) 
static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link; + bool enabled = trace_probe_is_enabled(&tk->tp); int ret = 0; if (file) { - link = kmalloc(sizeof(*link), GFP_KERNEL); - if (!link) { - ret = -ENOMEM; - goto out; - } - - link->file = file; - list_add_tail_rcu(&link->list, &tk->tp.files); + ret = trace_probe_add_file(&tk->tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE); - tk->tp.flags |= TP_FLAG_TRACE; - ret = __enable_trace_kprobe(tk); - if (ret) { - list_del_rcu(&link->list); - kfree(link); - tk->tp.flags &= ~TP_FLAG_TRACE; - } + if (enabled) + return 0; - } else { - tk->tp.flags |= TP_FLAG_PROFILE; - ret = __enable_trace_kprobe(tk); - if (ret) - tk->tp.flags &= ~TP_FLAG_PROFILE; + ret = __enable_trace_kprobe(tk); + if (ret) { + if (file) + trace_probe_remove_file(&tk->tp, file); + else + trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE); } - out: + return ret; } @@ -335,54 +329,34 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) static int disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link = NULL; - int wait = 0; + struct trace_probe *tp = &tk->tp; int ret = 0; if (file) { - link = find_event_file_link(&tk->tp, file); - if (!link) { - ret = -EINVAL; - goto out; - } - - list_del_rcu(&link->list); - wait = 1; - if (!list_empty(&tk->tp.files)) + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) goto out; - - tk->tp.flags &= ~TP_FLAG_TRACE; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); } else - tk->tp.flags &= ~TP_FLAG_PROFILE; + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { + if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) { if (trace_kprobe_is_return(tk)) disable_kretprobe(&tk->rp); else disable_kprobe(&tk->rp.kp); - 
wait = 1; } - /* - * if tk is not added to any list, it must be a local trace_kprobe - * created with perf_event_open. We don't need to wait for these - * trace_kprobes - */ - if (list_empty(&tk->devent.list)) - wait = 0; out: - if (wait) { + if (file) /* - * Synchronize with kprobe_trace_func/kretprobe_trace_func - * to ensure disabled (all running handlers are finished). - * This is not only for kfree(), but also the caller, - * trace_remove_event_call() supposes it for releasing - * event_call related objects, which will be accessed in - * the kprobe_trace_func/kretprobe_trace_func. + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. */ - synchronize_rcu(); - kfree(link); /* Ignored if link == NULL */ - } + trace_probe_remove_file(tp, file); return ret; } @@ -415,7 +389,7 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; - if (trace_probe_is_registered(&tk->tp)) + if (trace_kprobe_is_registered(tk)) return -EINVAL; if (within_notrace_func(tk)) { @@ -441,21 +415,20 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) - tk->tp.flags |= TP_FLAG_REGISTERED; return ret; } /* Internal unregister function - just handle k*probes and flags */ static void __unregister_trace_kprobe(struct trace_kprobe *tk) { - if (trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_registered(tk)) { if (trace_kprobe_is_return(tk)) unregister_kretprobe(&tk->rp); else unregister_kprobe(&tk->rp.kp); - tk->tp.flags &= ~TP_FLAG_REGISTERED; - /* Cleanup kprobe for reuse */ + /* Cleanup kprobe for reuse and mark it unregistered */ + INIT_HLIST_NODE(&tk->rp.kp.hlist); + INIT_LIST_HEAD(&tk->rp.kp.list); if (tk->rp.kp.symbol_name) tk->rp.kp.addr = NULL; } @@ -487,8 +460,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk) 
mutex_lock(&event_mutex); /* Delete old (same name) event if exist */ - old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call), - tk->tp.call.class->system); + old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), + trace_probe_group_name(&tk->tp)); if (old_tk) { ret = unregister_trace_kprobe(old_tk); if (ret < 0) @@ -541,7 +514,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, ret = __register_trace_kprobe(tk); if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", - trace_event_name(&tk->tp.call), + trace_probe_name(&tk->tp), mod->name, ret); } } @@ -716,6 +689,10 @@ static int trace_kprobe_create(int argc, const char *argv[]) goto error; /* This can be -ENOMEM */ } + ret = traceprobe_set_print_fmt(&tk->tp, is_return); + if (ret < 0) + goto error; + ret = register_trace_kprobe(tk); if (ret) { trace_probe_log_set_index(1); @@ -767,8 +744,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev) int i; seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tk->tp.call.class->system, - trace_event_name(&tk->tp.call)); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tk->tp), + trace_probe_name(&tk->tp)); if (!tk->symbol) seq_printf(m, " 0x%p", tk->rp.kp.addr); @@ -842,7 +819,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) tk = to_trace_kprobe(ev); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), + trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), tk->rp.kp.nmissed); @@ -886,6 +863,15 @@ fetch_store_strlen(unsigned long addr) return (ret < 0) ? ret : len; } +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE); +} + /* * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max * length and relative data location. 
@@ -894,19 +880,46 @@ static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { int maxlen = get_loc_len(*(u32 *)dest); - u8 *dst = get_loc_data(dest, base); + void *__dest; long ret; if (unlikely(!maxlen)) return -ENOMEM; + + __dest = get_loc_data(dest, base); + /* * Try to get string again, since the string can be changed while * probing. */ - ret = strncpy_from_unsafe(dst, (void *)addr, maxlen); + ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen); if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, (void *)dst - base); + *(u32 *)dest = make_data_loc(ret, __dest - base); + return ret; } @@ -916,6 +929,14 @@ probe_mem_read(void *dest, void *src, size_t size) return probe_kernel_read(dest, src, size); } +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + const void __user *uaddr = (__force const void __user *)src; + + return probe_user_read(dest, uaddr, size); +} + /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, @@ -971,7 +992,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct trace_event_call *call = &tk->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); 
WARN_ON(call != trace_file->event_call); @@ -1003,7 +1024,7 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tk->tp.files, list) + trace_probe_for_each_link_rcu(link, &tk->tp) __kprobe_trace_func(tk, regs, link->file); } NOKPROBE_SYMBOL(kprobe_trace_func); @@ -1019,7 +1040,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct trace_event_call *call = &tk->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); WARN_ON(call != trace_file->event_call); @@ -1053,7 +1074,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, { struct event_file_link *link; - list_for_each_entry_rcu(link, &tk->tp.files, list) + trace_probe_for_each_link_rcu(link, &tk->tp) __kretprobe_trace_func(tk, ri, regs, link->file); } NOKPROBE_SYMBOL(kretprobe_trace_func); @@ -1070,7 +1091,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; @@ -1097,7 +1118,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; @@ -1149,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct trace_event_call *call = &tk->tp.call; 
+ struct trace_event_call *call = trace_probe_event_call(&tk->tp); struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1199,7 +1220,7 @@ static void kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_event_call *call = &tk->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1299,10 +1320,10 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) raw_cpu_inc(*tk->nhit); - if (tk->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) ret = kprobe_perf_func(tk, regs); #endif return ret; @@ -1316,10 +1337,10 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) raw_cpu_inc(*tk->nhit); - if (tk->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ @@ -1334,10 +1355,10 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static inline void init_trace_event_call(struct trace_kprobe *tk, - struct trace_event_call *call) +static inline void init_trace_event_call(struct trace_kprobe *tk) { - INIT_LIST_HEAD(&call->class->fields); + struct trace_event_call *call = trace_probe_event_call(&tk->tp); + if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; @@ -1353,37 +1374,14 @@ static inline void init_trace_event_call(struct trace_kprobe *tk, static int 
register_kprobe_event(struct trace_kprobe *tk) { - struct trace_event_call *call = &tk->tp.call; - int ret = 0; - - init_trace_event_call(tk, call); + init_trace_event_call(tk); - if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) - return -ENOMEM; - ret = register_trace_event(&call->event); - if (!ret) { - kfree(call->print_fmt); - return -ENODEV; - } - ret = trace_add_event_call(call); - if (ret) { - pr_info("Failed to register kprobe event: %s\n", - trace_event_name(call)); - kfree(call->print_fmt); - unregister_trace_event(&call->event); - } - return ret; + return trace_probe_register_event_call(&tk->tp); } static int unregister_kprobe_event(struct trace_kprobe *tk) { - int ret; - - /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tk->tp.call); - if (!ret) - kfree(tk->tp.call.print_fmt); - return ret; + return trace_probe_unregister_event_call(&tk->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1413,7 +1411,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, return ERR_CAST(tk); } - init_trace_event_call(tk, &tk->tp.call); + init_trace_event_call(tk); if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { ret = -ENOMEM; @@ -1421,12 +1419,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, } ret = __register_trace_kprobe(tk); - if (ret < 0) { - kfree(tk->tp.call.print_fmt); + if (ret < 0) goto error; - } - return &tk->tp.call; + return trace_probe_event_call(&tk->tp); error: free_trace_kprobe(tk); return ERR_PTR(ret); @@ -1445,11 +1441,50 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) __unregister_trace_kprobe(tk); - kfree(tk->tp.call.print_fmt); free_trace_kprobe(tk); } #endif /* CONFIG_PERF_EVENTS */ +static __init void enable_boot_kprobe_events(void) +{ + struct trace_array *tr = top_trace_array(); + struct trace_event_file *file; + struct trace_kprobe *tk; + struct dyn_event *pos; + + mutex_lock(&event_mutex); + 
for_each_trace_kprobe(tk, pos) { + list_for_each_entry(file, &tr->events, list) + if (file->event_call == trace_probe_event_call(&tk->tp)) + trace_event_enable_disable(file, 1, 0); + } + mutex_unlock(&event_mutex); +} + +static __init void setup_boot_kprobe_events(void) +{ + char *p, *cmd = kprobe_boot_events_buf; + int ret; + + strreplace(kprobe_boot_events_buf, ',', ' '); + + while (cmd && *cmd != '\0') { + p = strchr(cmd, ';'); + if (p) + *p++ = '\0'; + + ret = trace_run_command(cmd, create_or_delete_trace_kprobe); + if (ret) + pr_warn("Failed to add event(%d): %s\n", ret, cmd); + else + kprobe_boot_events_enabled = true; + + cmd = p; + } + + enable_boot_kprobe_events(); +} + /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { @@ -1481,6 +1516,9 @@ static __init int init_kprobe_trace(void) if (!entry) pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); + + setup_boot_kprobe_events(); + return 0; } fs_initcall(init_kprobe_trace); @@ -1493,7 +1531,7 @@ find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) - if (file->event_call == &tk->tp.call) + if (file->event_call == trace_probe_event_call(&tk->tp)) return file; return NULL; @@ -1513,6 +1551,11 @@ static __init int kprobe_trace_self_tests_init(void) if (tracing_is_disabled()) return -ENODEV; + if (kprobe_boot_events_enabled) { + pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n"); + return 0; + } + target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index a347faced959..dbef0d135075 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -78,6 +78,8 @@ static const struct fetch_type probe_fetch_types[] = { /* Special types */ __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), + 
__ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, + "__data_loc char[]"), /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), ASSIGN_FETCH_TYPE(u16, u16, 0), @@ -322,6 +324,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, { struct fetch_insn *code = *pcode; unsigned long param; + int deref = FETCH_OP_DEREF; long offset = 0; char *tmp; int ret = 0; @@ -394,9 +397,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '+': /* deref memory */ - arg++; /* Skip '+', because kstrtol() rejects it. */ - /* fall through */ case '-': + if (arg[1] == 'u') { + deref = FETCH_OP_UDEREF; + arg[1] = arg[0]; + arg++; + } + if (arg[0] == '+') + arg++; /* Skip '+', because kstrtol() rejects it. */ tmp = strchr(arg, '('); if (!tmp) { trace_probe_log_err(offs, DEREF_NEED_BRACE); @@ -432,7 +440,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } *pcode = code; - code->op = FETCH_OP_DEREF; + code->op = deref; code->offset = offset; } break; @@ -569,15 +577,17 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; /* Store operation */ - if (!strcmp(parg->type->name, "string")) { - if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && - code->op != FETCH_OP_COMM) { + if (!strcmp(parg->type->name, "string") || + !strcmp(parg->type->name, "ustring")) { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { trace_probe_log_err(offset + (t ? 
(t - arg) : 0), BAD_STRING); ret = -EINVAL; goto fail; } - if (code->op != FETCH_OP_DEREF || parg->count) { + if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) || + parg->count) { /* * IMM and COMM is pointing actual address, those must * be kept, and if parg->count != 0, this is an array @@ -590,12 +600,20 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; } } - code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */ + /* If op == DEREF, replace it with STRING */ + if (!strcmp(parg->type->name, "ustring") || + code->op == FETCH_OP_UDEREF) + code->op = FETCH_OP_ST_USTRING; + else + code->op = FETCH_OP_ST_STRING; code->size = parg->type->size; parg->dynamic = true; } else if (code->op == FETCH_OP_DEREF) { code->op = FETCH_OP_ST_MEM; code->size = parg->type->size; + } else if (code->op == FETCH_OP_UDEREF) { + code->op = FETCH_OP_ST_UMEM; + code->size = parg->type->size; } else { code++; if (code->op != FETCH_OP_NOP) { @@ -618,7 +636,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && - scode->op != FETCH_OP_ST_STRING) { + scode->op != FETCH_OP_ST_STRING && + scode->op != FETCH_OP_ST_USTRING) { trace_probe_log_err(offset + (t ? 
(t - arg) : 0), BAD_STRING); ret = -EINVAL; @@ -825,6 +844,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) { + struct trace_event_call *call = trace_probe_event_call(tp); int len; char *print_fmt; @@ -836,7 +856,7 @@ int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) /* Second: actually write the @print_fmt */ __set_print_fmt(tp, print_fmt, len + 1, is_return); - tp->call.print_fmt = print_fmt; + call->print_fmt = print_fmt; return 0; } @@ -865,3 +885,105 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, } return 0; } + + +void trace_probe_cleanup(struct trace_probe *tp) +{ + struct trace_event_call *call = trace_probe_event_call(tp); + int i; + + for (i = 0; i < tp->nr_args; i++) + traceprobe_free_probe_arg(&tp->args[i]); + + kfree(call->class->system); + kfree(call->name); + kfree(call->print_fmt); +} + +int trace_probe_init(struct trace_probe *tp, const char *event, + const char *group) +{ + struct trace_event_call *call = trace_probe_event_call(tp); + + if (!event || !group) + return -EINVAL; + + call->class = &tp->class; + call->name = kstrdup(event, GFP_KERNEL); + if (!call->name) + return -ENOMEM; + + tp->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->class.system) { + kfree(call->name); + call->name = NULL; + return -ENOMEM; + } + INIT_LIST_HEAD(&tp->files); + INIT_LIST_HEAD(&tp->class.fields); + + return 0; +} + +int trace_probe_register_event_call(struct trace_probe *tp) +{ + struct trace_event_call *call = trace_probe_event_call(tp); + int ret; + + ret = register_trace_event(&call->event); + if (!ret) + return -ENODEV; + + ret = trace_add_event_call(call); + if (ret) + unregister_trace_event(&call->event); + + return ret; +} + +int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) +{ + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return 
-ENOMEM; + + link->file = file; + INIT_LIST_HEAD(&link->list); + list_add_tail_rcu(&link->list, &tp->files); + trace_probe_set_flag(tp, TP_FLAG_TRACE); + return 0; +} + +struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, + struct trace_event_file *file) +{ + struct event_file_link *link; + + trace_probe_for_each_link(link, tp) { + if (link->file == file) + return link; + } + + return NULL; +} + +int trace_probe_remove_file(struct trace_probe *tp, + struct trace_event_file *file) +{ + struct event_file_link *link; + + link = trace_probe_get_file_link(tp, file); + if (!link) + return -ENOENT; + + list_del_rcu(&link->list); + synchronize_rcu(); + kfree(link); + + if (list_empty(&tp->files)) + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f9a8c632188b..d1714820efe1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -55,7 +55,6 @@ /* Flags for trace_probe */ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 -#define TP_FLAG_REGISTERED 4 /* data_loc: data location, compatible with u32 */ #define make_data_loc(len, offs) \ @@ -92,10 +91,13 @@ enum fetch_op { FETCH_OP_FOFFS, /* File offset: .immediate */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ + FETCH_OP_UDEREF, /* User-space Dereference: .offset */ // Stage 3 (store) ops FETCH_OP_ST_RAW, /* Raw: .size */ FETCH_OP_ST_MEM, /* Mem: .offset, .size */ + FETCH_OP_ST_UMEM, /* Mem: .offset, .size */ FETCH_OP_ST_STRING, /* String: .offset, .size */ + FETCH_OP_ST_USTRING, /* User String: .offset, .size */ // Stage 4 (modify) op FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op @@ -235,16 +237,71 @@ struct event_file_link { struct list_head list; }; +static inline bool trace_probe_test_flag(struct trace_probe *tp, + unsigned int flag) +{ + return !!(tp->flags & flag); +} + +static inline void trace_probe_set_flag(struct trace_probe *tp, 
+ unsigned int flag) +{ + tp->flags |= flag; +} + +static inline void trace_probe_clear_flag(struct trace_probe *tp, + unsigned int flag) +{ + tp->flags &= ~flag; +} + static inline bool trace_probe_is_enabled(struct trace_probe *tp) { - return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); + return trace_probe_test_flag(tp, TP_FLAG_TRACE | TP_FLAG_PROFILE); } -static inline bool trace_probe_is_registered(struct trace_probe *tp) +static inline const char *trace_probe_name(struct trace_probe *tp) { - return !!(tp->flags & TP_FLAG_REGISTERED); + return trace_event_name(&tp->call); } +static inline const char *trace_probe_group_name(struct trace_probe *tp) +{ + return tp->call.class->system; +} + +static inline struct trace_event_call * + trace_probe_event_call(struct trace_probe *tp) +{ + return &tp->call; +} + +static inline int trace_probe_unregister_event_call(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + return trace_remove_event_call(&tp->call); +} + +static inline bool trace_probe_has_single_file(struct trace_probe *tp) +{ + return !!list_is_singular(&tp->files); +} + +int trace_probe_init(struct trace_probe *tp, const char *event, + const char *group); +void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_register_event_call(struct trace_probe *tp); +int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file); +int trace_probe_remove_file(struct trace_probe *tp, + struct trace_event_file *file); +struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, + struct trace_event_file *file); + +#define trace_probe_for_each_link(pos, tp) \ + list_for_each_entry(pos, &(tp)->files, list) +#define trace_probe_for_each_link_rcu(pos, tp) \ + list_for_each_entry_rcu(pos, &(tp)->files, list) + /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { @@ -257,18 +314,6 @@ static inline bool is_good_name(const char *name) return 
true; } -static inline struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) -{ - struct event_file_link *link; - - list_for_each_entry(link, &tp->files, list) - if (link->file == file) - return link; - - return NULL; -} - #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index c30c61f12ddd..e5282828f4a6 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -59,8 +59,13 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base); +static nokprobe_inline int fetch_store_strlen_user(unsigned long addr); +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base); static nokprobe_inline int probe_mem_read(void *dest, void *src, size_t size); +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size); /* From the 2nd stage, routine is same */ static nokprobe_inline int @@ -74,14 +79,21 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, stage2: /* 2nd stage: dereference memory if needed */ - while (code->op == FETCH_OP_DEREF) { - lval = val; - ret = probe_mem_read(&val, (void *)val + code->offset, - sizeof(val)); + do { + if (code->op == FETCH_OP_DEREF) { + lval = val; + ret = probe_mem_read(&val, (void *)val + code->offset, + sizeof(val)); + } else if (code->op == FETCH_OP_UDEREF) { + lval = val; + ret = probe_mem_read_user(&val, + (void *)val + code->offset, sizeof(val)); + } else + break; if (ret) return ret; code++; - } + } while (1); s3 = code; stage3: @@ -91,6 +103,10 @@ stage3: ret = fetch_store_strlen(val + code->offset); code++; goto array; + } else if (code->op == FETCH_OP_ST_USTRING) { + ret += 
fetch_store_strlen_user(val + code->offset); + code++; + goto array; } else return -EILSEQ; } @@ -102,10 +118,17 @@ stage3: case FETCH_OP_ST_MEM: probe_mem_read(dest, (void *)val + code->offset, code->size); break; + case FETCH_OP_ST_UMEM: + probe_mem_read_user(dest, (void *)val + code->offset, code->size); + break; case FETCH_OP_ST_STRING: loc = *(u32 *)dest; ret = fetch_store_string(val + code->offset, dest, base); break; + case FETCH_OP_ST_USTRING: + loc = *(u32 *)dest; + ret = fetch_store_string_user(val + code->offset, dest, base); + break; default: return -EILSEQ; } @@ -123,7 +146,8 @@ array: total += ret; if (++i < code->param) { code = s3; - if (s3->op != FETCH_OP_ST_STRING) { + if (s3->op != FETCH_OP_ST_STRING && + s3->op != FETCH_OP_ST_USTRING) { dest += s3->size; val += s3->size; goto stage3; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7860e3f59fad..1ceedb9146b1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -140,6 +140,13 @@ probe_mem_read(void *dest, void *src, size_t size) return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; } + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + return probe_mem_read(dest, src, size); +} + /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. @@ -176,6 +183,12 @@ fetch_store_string(unsigned long addr, void *dest, void *base) return ret; } +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + return fetch_store_string(addr, dest, base); +} + /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) @@ -191,6 +204,12 @@ fetch_store_strlen(unsigned long addr) return (len > MAX_STRING_SIZE) ? 
0 : len; } +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + return fetch_store_strlen(addr); +} + static unsigned long translate_user_vaddr(unsigned long file_offset) { unsigned long base_addr; @@ -270,8 +289,8 @@ static bool trace_uprobe_match(const char *system, const char *event, { struct trace_uprobe *tu = to_trace_uprobe(ev); - return strcmp(trace_event_name(&tu->tp.call), event) == 0 && - (!system || strcmp(tu->tp.call.class->system, system) == 0); + return strcmp(trace_probe_name(&tu->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); } /* @@ -281,25 +300,17 @@ static struct trace_uprobe * alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; - - if (!event || !group) - return ERR_PTR(-EINVAL); + int ret; tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); if (!tu) return ERR_PTR(-ENOMEM); - tu->tp.call.class = &tu->tp.class; - tu->tp.call.name = kstrdup(event, GFP_KERNEL); - if (!tu->tp.call.name) - goto error; - - tu->tp.class.system = kstrdup(group, GFP_KERNEL); - if (!tu->tp.class.system) + ret = trace_probe_init(&tu->tp, event, group); + if (ret < 0) goto error; dyn_event_init(&tu->devent, &trace_uprobe_ops); - INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; @@ -307,25 +318,18 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) return tu; error: - kfree(tu->tp.call.name); kfree(tu); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } static void free_trace_uprobe(struct trace_uprobe *tu) { - int i; - if (!tu) return; - for (i = 0; i < tu->tp.nr_args; i++) - traceprobe_free_probe_arg(&tu->tp.args[i]); - path_put(&tu->path); - kfree(tu->tp.call.class->system); - kfree(tu->tp.call.name); + trace_probe_cleanup(&tu->tp); kfree(tu->filename); kfree(tu); } @@ -336,8 +340,8 @@ static struct trace_uprobe 
*find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; for_each_trace_uprobe(tu, pos) - if (strcmp(trace_event_name(&tu->tp.call), event) == 0 && - strcmp(tu->tp.call.class->system, group) == 0) + if (strcmp(trace_probe_name(&tu->tp), event) == 0 && + strcmp(trace_probe_group_name(&tu->tp), group) == 0) return tu; return NULL; @@ -372,8 +376,8 @@ static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) struct trace_uprobe *tmp, *old = NULL; struct inode *new_inode = d_real_inode(new->path.dentry); - old = find_probe_event(trace_event_name(&new->tp.call), - new->tp.call.class->system); + old = find_probe_event(trace_probe_name(&new->tp), + trace_probe_group_name(&new->tp)); for_each_trace_uprobe(tmp, pos) { if ((old ? old != tmp : true) && @@ -578,6 +582,10 @@ static int trace_uprobe_create(int argc, const char **argv) goto error; } + ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)); + if (ret < 0) + goto error; + ret = register_trace_uprobe(tu); if (!ret) goto out; @@ -621,8 +629,8 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev) char c = is_ret_probe(tu) ? 
'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system, - trace_event_name(&tu->tp.call), tu->filename, + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, trace_probe_group_name(&tu->tp), + trace_probe_name(&tu->tp), tu->filename, (int)(sizeof(void *) * 2), tu->offset); if (tu->ref_ctr_offset) @@ -692,7 +700,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) tu = to_trace_uprobe(ev); seq_printf(m, " %s %-44s %15lu\n", tu->filename, - trace_event_name(&tu->tp.call), tu->nhit); + trace_probe_name(&tu->tp), tu->nhit); return 0; } @@ -818,7 +826,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, struct ring_buffer *buffer; void *data; int size, esize; - struct trace_event_call *call = &tu->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); @@ -860,7 +868,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, return 0; rcu_read_lock(); - list_for_each_entry_rcu(link, &tu->tp.files, list) + trace_probe_for_each_link_rcu(link, &tu->tp) __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); rcu_read_unlock(); @@ -874,7 +882,7 @@ static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct event_file_link *link; rcu_read_lock(); - list_for_each_entry_rcu(link, &tu->tp.files, list) + trace_probe_for_each_link_rcu(link, &tu->tp) __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); rcu_read_unlock(); } @@ -893,12 +901,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", - trace_event_name(&tu->tp.call), + trace_probe_name(&tu->tp), entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { trace_seq_printf(s, "%s: (0x%lx)", - trace_event_name(&tu->tp.call), + trace_probe_name(&tu->tp), entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } @@ -921,26 +929,20 @@ 
probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, filter_func_t filter) { bool enabled = trace_probe_is_enabled(&tu->tp); - struct event_file_link *link = NULL; int ret; if (file) { - if (tu->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) return -EINTR; - link = kmalloc(sizeof(*link), GFP_KERNEL); - if (!link) - return -ENOMEM; - - link->file = file; - list_add_tail_rcu(&link->list, &tu->tp.files); - - tu->tp.flags |= TP_FLAG_TRACE; + ret = trace_probe_add_file(&tu->tp, file); + if (ret < 0) + return ret; } else { - if (tu->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) return -EINTR; - tu->tp.flags |= TP_FLAG_PROFILE; + trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE); } WARN_ON(!uprobe_filter_is_empty(&tu->filter)); @@ -970,13 +972,11 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, uprobe_buffer_disable(); err_flags: - if (file) { - list_del(&link->list); - kfree(link); - tu->tp.flags &= ~TP_FLAG_TRACE; - } else { - tu->tp.flags &= ~TP_FLAG_PROFILE; - } + if (file) + trace_probe_remove_file(&tu->tp, file); + else + trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); + return ret; } @@ -987,26 +987,18 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) return; if (file) { - struct event_file_link *link; - - link = find_event_file_link(&tu->tp, file); - if (!link) + if (trace_probe_remove_file(&tu->tp, file) < 0) return; - list_del_rcu(&link->list); - /* synchronize with u{,ret}probe_trace_func */ - synchronize_rcu(); - kfree(link); - - if (!list_empty(&tu->tp.files)) + if (trace_probe_is_enabled(&tu->tp)) return; - } + } else + trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->inode = NULL; - tu->tp.flags &= file ? 
~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); } @@ -1126,7 +1118,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - struct trace_event_call *call = &tu->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; struct hlist_head *head; void *data; @@ -1279,11 +1271,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) ucb = uprobe_buffer_get(); store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if (tu->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) ret |= uprobe_trace_func(tu, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS - if (tu->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) ret |= uprobe_perf_func(tu, regs, ucb, dsize); #endif uprobe_buffer_put(ucb); @@ -1314,11 +1306,11 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, ucb = uprobe_buffer_get(); store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if (tu->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) uretprobe_trace_func(tu, func, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS - if (tu->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) uretprobe_perf_func(tu, func, regs, ucb, dsize); #endif uprobe_buffer_put(ucb); @@ -1329,10 +1321,10 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -static inline void init_trace_event_call(struct trace_uprobe *tu, - struct trace_event_call *call) +static inline void init_trace_event_call(struct trace_uprobe *tu) { - INIT_LIST_HEAD(&call->class->fields); + struct trace_event_call *call = trace_probe_event_call(&tu->tp); + call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; @@ -1343,43 +1335,14 @@ static inline void init_trace_event_call(struct 
trace_uprobe *tu, static int register_uprobe_event(struct trace_uprobe *tu) { - struct trace_event_call *call = &tu->tp.call; - int ret = 0; - - init_trace_event_call(tu, call); - - if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) - return -ENOMEM; + init_trace_event_call(tu); - ret = register_trace_event(&call->event); - if (!ret) { - kfree(call->print_fmt); - return -ENODEV; - } - - ret = trace_add_event_call(call); - - if (ret) { - pr_info("Failed to register uprobe event: %s\n", - trace_event_name(call)); - kfree(call->print_fmt); - unregister_trace_event(&call->event); - } - - return ret; + return trace_probe_register_event_call(&tu->tp); } static int unregister_uprobe_event(struct trace_uprobe *tu) { - int ret; - - /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tu->tp.call); - if (ret) - return ret; - kfree(tu->tp.call.print_fmt); - tu->tp.call.print_fmt = NULL; - return 0; + return trace_probe_unregister_event_call(&tu->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1419,14 +1382,14 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); - init_trace_event_call(tu, &tu->tp.call); + init_trace_event_call(tu); if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { ret = -ENOMEM; goto error; } - return &tu->tp.call; + return trace_probe_event_call(&tu->tp); error: free_trace_uprobe(tu); return ERR_PTR(ret); @@ -1438,9 +1401,6 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) tu = container_of(event_call, struct trace_uprobe, tp.call); - kfree(tu->tp.call.print_fmt); - tu->tp.call.print_fmt = NULL; - free_trace_uprobe(tu); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index df3ade14ccbd..73956eaff8a9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -55,8 +55,8 @@ struct tp_probes { static inline void *allocate_probes(int count) { 
- struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func) - + sizeof(struct tp_probes), GFP_KERNEL); + struct tp_probes *p = kmalloc(struct_size(p, probes, count), + GFP_KERNEL); return p == NULL ? NULL : p->probes; } diff --git a/mm/maccess.c b/mm/maccess.c index 482d4d670f19..d065736f6b87 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -6,8 +6,20 @@ #include <linux/mm.h> #include <linux/uaccess.h> +static __always_inline long +probe_read_common(void *dst, const void __user *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + /** - * probe_kernel_read(): safely attempt to read from a location + * probe_kernel_read(): safely attempt to read from a kernel-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from * @size: size of the data chunk @@ -30,17 +42,41 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, - (__force const void __user *)src, size); - pagefault_enable(); + ret = probe_read_common(dst, (__force const void __user *)src, size); set_fs(old_fs); - return ret ? -EFAULT : 0; + return ret; } EXPORT_SYMBOL_GPL(probe_kernel_read); /** + * probe_user_read(): safely attempt to read from a user-space location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from. This must be a user address. + * @size: size of the data chunk + * + * Safely read from user address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. 
+ */ + +long __weak probe_user_read(void *dst, const void __user *src, size_t size) + __attribute__((alias("__probe_user_read"))); + +long __probe_user_read(void *dst, const void __user *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(src, size)) + ret = probe_read_common(dst, src, size); + set_fs(old_fs); + + return ret; +} +EXPORT_SYMBOL_GPL(probe_user_read); + +/** * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to * @src: pointer to the data that shall be written @@ -67,6 +103,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) } EXPORT_SYMBOL_GPL(probe_kernel_write); + /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. * @dst: Destination address, in kernel space. This buffer must be at @@ -106,3 +143,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) return ret ? -EFAULT : src - unsafe_addr; } + +/** + * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user + * address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @unsafe_addr: Unsafe user address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe user address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. 
+ */ +long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, + long count) +{ + mm_segment_t old_fs = get_fs(); + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(USER_DS); + pagefault_disable(); + ret = strncpy_from_user(dst, unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + if (ret >= count) { + ret = count; + dst[ret - 1] = '\0'; + } else if (ret > 0) { + ret++; + } + + return ret; +} + +/** + * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL. + * @unsafe_addr: The string to measure. + * @count: Maximum count (including NUL) + * + * Get the size of a NUL-terminated string in user space without pagefault. + * + * Returns the size of the string INCLUDING the terminating NUL. + * + * If the string is too long, returns a number larger than @count. User + * has to check the return value against "> count". + * On exception (or invalid count), returns 0. + * + * Unlike strnlen_user, this can be used from IRQ handler etc. because + * it disables pagefaults. + */ +long strnlen_unsafe_user(const void __user *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + int ret; + + set_fs(USER_DS); + pagefault_disable(); + ret = strnlen_user(unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + return ret; +} diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index b6866a05edd2..ed3ecfa422e1 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -194,12 +194,13 @@ PROBE ARGUMENT -------------- Each probe argument follows below syntax. - [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE] + [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE][@user] 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. 
$retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.) '$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters. 'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo (*). Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal integers (x/x8/x16/x32/x64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail) On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid. +"@user" is a special attribute which means the LOCALVAR will be treated as a user-space memory. This is only valid for kprobe event. TYPES ----- diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 0c3b55d0617d..cd1eb73cfe83 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1562,6 +1562,17 @@ static int parse_perf_probe_arg(char *str, struct perf_probe_arg *arg) str = tmp + 1; } + tmp = strchr(str, '@'); + if (tmp && tmp != str && strcmp(tmp + 1, "user")) { /* user attr */ + if (!user_access_is_supported()) { + semantic_error("ftrace does not support user access\n"); + return -EINVAL; + } + *tmp = '\0'; + arg->user_access = true; + pr_debug("user_access "); + } + tmp = strchr(str, ':'); if (tmp) { /* Type setting */ *tmp = '\0'; diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 05c8d571a901..96a319cd2378 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -37,6 +37,7 @@ struct probe_trace_point { struct probe_trace_arg_ref { struct probe_trace_arg_ref *next; /* Next reference */ long offset; /* Offset value */ + bool user_access; 
/* User-memory access */ }; /* kprobe-tracer and uprobe-tracer tracing argument */ @@ -82,6 +83,7 @@ struct perf_probe_arg { char *var; /* Variable name */ char *type; /* Type name */ struct perf_probe_arg_field *field; /* Structure fields */ + bool user_access; /* User-memory access */ }; /* Perf probe probing event (point + arg) */ diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index c2998f90b23c..5b4d49382932 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -1005,6 +1005,7 @@ enum ftrace_readme { FTRACE_README_PROBE_TYPE_X = 0, FTRACE_README_KRETPROBE_OFFSET, FTRACE_README_UPROBE_REF_CTR, + FTRACE_README_USER_ACCESS, FTRACE_README_END, }; @@ -1017,6 +1018,7 @@ static struct { DEFINE_TYPE(FTRACE_README_PROBE_TYPE_X, "*type: * x8/16/32/64,*"), DEFINE_TYPE(FTRACE_README_KRETPROBE_OFFSET, "*place (kretprobe): *"), DEFINE_TYPE(FTRACE_README_UPROBE_REF_CTR, "*ref_ctr_offset*"), + DEFINE_TYPE(FTRACE_README_USER_ACCESS, "*[u]<offset>*"), }; static bool scan_ftrace_readme(enum ftrace_readme type) @@ -1077,3 +1079,8 @@ bool uprobe_ref_ctr_is_supported(void) { return scan_ftrace_readme(FTRACE_README_UPROBE_REF_CTR); } + +bool user_access_is_supported(void) +{ + return scan_ftrace_readme(FTRACE_README_USER_ACCESS); +} diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h index 2a249182f2a6..986c1c94f64f 100644 --- a/tools/perf/util/probe-file.h +++ b/tools/perf/util/probe-file.h @@ -70,6 +70,7 @@ int probe_cache__show_all_caches(struct strfilter *filter); bool probe_type_is_available(enum probe_type type); bool kretprobe_offset_is_supported(void); bool uprobe_ref_ctr_is_supported(void); +bool user_access_is_supported(void); #else /* ! 
HAVE_LIBELF_SUPPORT */ static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused, struct nsinfo *nsi __maybe_unused) { diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 7d8c99734928..025fc4491993 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -280,7 +280,7 @@ static_var: static int convert_variable_type(Dwarf_Die *vr_die, struct probe_trace_arg *tvar, - const char *cast) + const char *cast, bool user_access) { struct probe_trace_arg_ref **ref_ptr = &tvar->ref; Dwarf_Die type; @@ -320,7 +320,8 @@ static int convert_variable_type(Dwarf_Die *vr_die, pr_debug("%s type is %s.\n", dwarf_diename(vr_die), dwarf_diename(&type)); - if (cast && strcmp(cast, "string") == 0) { /* String type */ + if (cast && (!strcmp(cast, "string") || !strcmp(cast, "ustring"))) { + /* String type */ ret = dwarf_tag(&type); if (ret != DW_TAG_pointer_type && ret != DW_TAG_array_type) { @@ -343,6 +344,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, pr_warning("Out of memory error\n"); return -ENOMEM; } + (*ref_ptr)->user_access = user_access; } if (!die_compare_name(&type, "char") && !die_compare_name(&type, "unsigned char")) { @@ -397,7 +399,7 @@ formatted: static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, struct perf_probe_arg_field *field, struct probe_trace_arg_ref **ref_ptr, - Dwarf_Die *die_mem) + Dwarf_Die *die_mem, bool user_access) { struct probe_trace_arg_ref *ref = *ref_ptr; Dwarf_Die type; @@ -434,6 +436,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, *ref_ptr = ref; } ref->offset += dwarf_bytesize(&type) * field->index; + ref->user_access = user_access; goto next; } else if (tag == DW_TAG_pointer_type) { /* Check the pointer and dereference */ @@ -505,17 +508,18 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, } } ref->offset += (long)offs; + ref->user_access = user_access; /* If this member is 
unnamed, we need to reuse this field */ if (!dwarf_diename(die_mem)) return convert_variable_fields(die_mem, varname, field, - &ref, die_mem); + &ref, die_mem, user_access); next: /* Converting next field */ if (field->next) return convert_variable_fields(die_mem, field->name, - field->next, &ref, die_mem); + field->next, &ref, die_mem, user_access); else return 0; } @@ -541,11 +545,12 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf) else if (ret == 0 && pf->pvar->field) { ret = convert_variable_fields(vr_die, pf->pvar->var, pf->pvar->field, &pf->tvar->ref, - &die_mem); + &die_mem, pf->pvar->user_access); vr_die = &die_mem; } if (ret == 0) - ret = convert_variable_type(vr_die, pf->tvar, pf->pvar->type); + ret = convert_variable_type(vr_die, pf->tvar, pf->pvar->type, + pf->pvar->user_access); /* *expr will be cached in libdw. Don't free it. */ return ret; } diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 6d5e9e87c4b7..063ecb290a5a 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -23,9 +23,15 @@ echo " If <dir> is -, all logs output in console only" exit $1 } +# default error +err_ret=1 + +# kselftest skip code is 4 +err_skip=4 + errexit() { # message echo "Error: $1" 1>&2 - exit 1 + exit $err_ret } # Ensuring user privilege @@ -116,11 +122,31 @@ parse_opts() { # opts } # Parameters -DEBUGFS_DIR=`grep debugfs /proc/mounts | cut -f2 -d' ' | head -1` -if [ -z "$DEBUGFS_DIR" ]; then - TRACING_DIR=`grep tracefs /proc/mounts | cut -f2 -d' ' | head -1` -else - TRACING_DIR=$DEBUGFS_DIR/tracing +TRACING_DIR=`grep tracefs /proc/mounts | cut -f2 -d' ' | head -1` +if [ -z "$TRACING_DIR" ]; then + DEBUGFS_DIR=`grep debugfs /proc/mounts | cut -f2 -d' ' | head -1` + if [ -z "$DEBUGFS_DIR" ]; then + # If tracefs exists, then so does /sys/kernel/tracing + if [ -d "/sys/kernel/tracing" ]; then + mount -t tracefs nodev /sys/kernel/tracing || + 
errexit "Failed to mount /sys/kernel/tracing" + TRACING_DIR="/sys/kernel/tracing" + # If debugfs exists, then so does /sys/kernel/debug + elif [ -d "/sys/kernel/debug" ]; then + mount -t debugfs nodev /sys/kernel/debug || + errexit "Failed to mount /sys/kernel/debug" + TRACING_DIR="/sys/kernel/debug/tracing" + else + err_ret=$err_skip + errexit "debugfs and tracefs are not configured in this kernel" + fi + else + TRACING_DIR="$DEBUGFS_DIR/tracing" + fi +fi +if [ ! -d "$TRACING_DIR" ]; then + err_ret=$err_skip + errexit "ftrace is not configured in this kernel" fi TOP_DIR=`absdir $0` diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index 779ec11f61bd..1d96c5f7e402 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -91,8 +91,8 @@ initialize_ftrace() { # Reset ftrace to initial-state reset_events_filter reset_ftrace_filter disable_events - echo > set_event_pid # event tracer is always on - echo > set_ftrace_pid + [ -f set_event_pid ] && echo > set_event_pid + [ -f set_ftrace_pid ] && echo > set_ftrace_pid [ -f set_ftrace_filter ] && echo | tee set_ftrace_* [ -f set_graph_function ] && echo | tee set_graph_* [ -f stack_trace_filter ] && echo > stack_trace_filter diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc new file mode 100644 index 000000000000..0f60087583d8 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc @@ -0,0 +1,32 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event user-memory access + +[ -f kprobe_events ] || exit_unsupported # this is configurable + +grep -q '\$arg<N>' README || exit_unresolved # depends on arch +grep -A10 "fetcharg:" README | grep -q 'ustring' || exit_unsupported +grep -A10 "fetcharg:" README | grep -q '\[u\]<offset>' || exit_unsupported + +:;: 
"user-memory access syntax and ustring working on user memory";: +echo 'p:myevent do_sys_open path=+0($arg2):ustring path2=+u0($arg2):string' \ + > kprobe_events + +grep myevent kprobe_events | \ + grep -q 'path=+0($arg2):ustring path2=+u0($arg2):string' +echo 1 > events/kprobes/myevent/enable +echo > /dev/null +echo 0 > events/kprobes/myevent/enable + +grep myevent trace | grep -q 'path="/dev/null" path2="/dev/null"' + +:;: "user-memory access syntax and ustring not working with kernel memory";: +echo 'p:myevent vfs_symlink path=+0($arg3):ustring path2=+u0($arg3):string' \ + > kprobe_events +echo 1 > events/kprobes/myevent/enable +ln -s foo $TMPDIR/bar +echo 0 > events/kprobes/myevent/enable + +grep myevent trace | grep -q 'path=(fault) path2=(fault)' + +exit 0 |