diff options
-rw-r--r-- | include/linux/bpf.h | 2 | ||||
-rw-r--r-- | include/linux/perf_event.h | 4 | ||||
-rw-r--r-- | include/linux/trace_events.h | 9 | ||||
-rw-r--r-- | include/trace/perf.h | 23 | ||||
-rw-r--r-- | include/trace/trace_events.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 1 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 2 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 6 | ||||
-rw-r--r-- | kernel/events/core.c | 27 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 85 | ||||
-rw-r--r-- | kernel/trace/trace_event_perf.c | 40 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 18 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 10 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 13 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 5 | ||||
-rw-r--r-- | samples/bpf/Makefile | 5 | ||||
-rw-r--r-- | samples/bpf/bpf_load.c | 26 | ||||
-rw-r--r-- | samples/bpf/offwaketime_kern.c | 26 | ||||
-rw-r--r-- | samples/bpf/test_overhead_kprobe_kern.c | 41 | ||||
-rw-r--r-- | samples/bpf/test_overhead_tp_kern.c | 36 | ||||
-rw-r--r-- | samples/bpf/test_overhead_user.c | 162 |
21 files changed, 475 insertions, 69 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 21ee41b92e8a..b2365a6eba3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -131,6 +131,7 @@ struct bpf_prog_type_list { struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; + u32 max_ctx_offset; const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; @@ -160,6 +161,7 @@ struct bpf_array { #define MAX_TAIL_CALL_CNT 32 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_fd_array_map_clear(struct bpf_map *map); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f291275ffd71..eb41b535ef38 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { - memset(regs, 0, sizeof(*regs)); - perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } @@ -1018,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void) } extern void perf_event_init(void); -extern void perf_tp_event(u64 addr, u64 count, void *record, +extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0810f81b6db2..fe6441203b59 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type, int is_signed, int filter_type); extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); +extern int trace_event_get_offsets(struct trace_event_call *call); #define is_signed_type(type) (((type)(-1)) < (type)1) @@ -605,15 +606,15 @@ extern void perf_trace_del(struct perf_event *event, int flags); extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); -extern void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs **regs, int *rctxp); +void perf_trace_buf_update(void *record, u16 type); +void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); static inline void -perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, +perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, struct task_struct *task) { - perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); + perf_tp_event(type, count, raw_data, size, regs, head, rctx, task); } #endif diff --git a/include/trace/perf.h b/include/trace/perf.h index 26486fcd74ce..a182306eefd7 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -20,9 +20,6 @@ #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) -#undef __perf_addr -#define __perf_addr(a) (__addr = (a)) - #undef __perf_count #define __perf_count(c) (__count = (c)) @@ -37,8 +34,9 @@ perf_trace_##call(void *__data, proto) \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ struct trace_event_raw_##call *entry; \ + struct bpf_prog *prog = event_call->prog; \ struct pt_regs *__regs; \ - u64 __addr = 0, __count = 1; \ + u64 __count = 1; \ struct task_struct *__task = NULL; \ struct hlist_head *head; \ int __entry_size; \ @@ -48,7 +46,7 @@ perf_trace_##call(void *__data, proto) \ __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \ \ head = this_cpu_ptr(event_call->perf_events); \ - if (__builtin_constant_p(!__task) && !__task && \ + if (!prog && __builtin_constant_p(!__task) && !__task && \ hlist_empty(head)) \ return; \ \ @@ -56,8 +54,7 @@ perf_trace_##call(void *__data, proto) \ sizeof(u64)); \ __entry_size -= sizeof(u32); \ \ - entry = perf_trace_buf_prepare(__entry_size, \ - event_call->event.type, &__regs, &rctx); \ + entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \ if (!entry) \ return; \ \ @@ -67,8 +64,16 @@ perf_trace_##call(void *__data, proto) \ \ { assign; } \ \ - perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ - __count, __regs, head, __task); \ + if (prog) { \ + *(struct pt_regs **)entry = __regs; \ + if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \ + perf_swevent_put_recursion_context(rctx); \ + return; \ + } \ + } \ + perf_trace_buf_submit(entry, __entry_size, rctx, \ + event_call->event.type, __count, __regs, \ + head, __task); \ } /* diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 170c93bbdbb7..80679a9fae65 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -652,9 +652,6 @@ static inline notrace int trace_event_get_offsets_##call( \ #undef TP_fast_assign #define TP_fast_assign(args...) args -#undef __perf_addr -#define __perf_addr(a) (a) - #undef __perf_count #define __perf_count(c) (c) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23917bb47bf3..70eda5aeb304 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 499d9e933f8e..35114725cf30 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,7 +116,7 @@ free_smap: return ERR_PTR(err); } -static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) { struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e08f8e9b771..58792fed5678 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, enum bpf_access_type t) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) + env->prog->aux->ops->is_valid_access(off, size, t)) { + /* remember the offset of last byte accessed in ctx */ + if (env->prog->aux->max_ctx_offset < off + size) + env->prog->aux->max_ctx_offset = off + size; return 0; + } verbose("invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; diff --git a/kernel/events/core.c b/kernel/events/core.c index de24fbce5277..9a01019ff7c8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -inline void perf_swevent_put_recursion_context(int rctx) +void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } +EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { @@ -6987,7 +6988,7 @@ static int perf_tp_event_match(struct perf_event *event, return 1; } -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, +void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { @@ -6999,9 +7000,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, .data = record, }; - perf_sample_data_init(&data, addr, 0); + perf_sample_data_init(&data, 0, 0); data.raw = &raw; + perf_trace_buf_update(record, event_type); + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); @@ -7104,6 +7107,7 @@ static void perf_event_free_filter(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { + bool is_kprobe, is_tracepoint; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -7112,20 +7116,31 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) - /* bpf programs can only be attached to u/kprobes */ + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; + if (!is_kprobe && !is_tracepoint) + /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_KPROBE) { + if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } + if (is_tracepoint) { + int off = trace_event_get_offsets(event->tp_event); + + if (prog->aux->max_ctx_offset > off) { + bpf_prog_put(prog); + return -EACCES; + } + } event->tp_event->prog = prog; return 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e4ffb3ace5f..413ec5614180 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -268,7 +268,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -295,12 +295,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; + default: + return NULL; + } +} + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; default: - return NULL; + return tracing_func_proto(func_id); } } @@ -332,9 +340,82 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; +static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +{ + /* + * r1 points to perf tracepoint buffer where first 8 bytes are hidden + * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it + * from there and call the same bpf_perf_event_output() helper + */ + u64 ctx = *(long *)r1; + + return bpf_perf_event_output(ctx, r2, index, r4, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { + .func = bpf_perf_event_output_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + +static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + u64 ctx = *(long *)r1; + + return bpf_get_stackid(ctx, r2, r3, r4, r5); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_tp = { + .func = bpf_get_stackid_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +static const struct bpf_verifier_ops tracepoint_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +static struct bpf_prog_type_list tracepoint_tl = { + .ops = &tracepoint_prog_ops, + .type = BPF_PROG_TYPE_TRACEPOINT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); + bpf_register_prog_type(&tracepoint_tl); return 0; } late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 00df25fd86ef..5a927075977f 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -260,42 +260,43 @@ void perf_trace_del(struct perf_event *p_event, int flags) tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } -void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs **regs, int *rctxp) +void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) { - struct trace_entry *entry; - unsigned long flags; char *raw_data; - int pc; + int rctx; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough")) return NULL; - pc = preempt_count(); - - *rctxp = perf_swevent_get_recursion_context(); - if (*rctxp < 0) + *rctxp = rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) return NULL; if (regs) - *regs = this_cpu_ptr(&__perf_regs[*rctxp]); - raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + *regs = this_cpu_ptr(&__perf_regs[rctx]); + raw_data = this_cpu_ptr(perf_trace_buf[rctx]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); +NOKPROBE_SYMBOL(perf_trace_buf_alloc); + +void perf_trace_buf_update(void *record, u16 type) +{ + struct trace_entry *entry = record; + int pc = preempt_count(); + unsigned long flags; - entry = (struct trace_entry *)raw_data; local_save_flags(flags); tracing_generic_entry_update(entry, flags, pc); entry->type = type; - - return raw_data; } -EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); -NOKPROBE_SYMBOL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void @@ -316,15 +317,16 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + memset(®s, 0, sizeof(regs)); perf_fetch_caller_regs(®s); - entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) return; entry->ip = ip; entry->parent_ip = parent_ip; - perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, ®s, head, NULL); #undef ENTRY_SIZE diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05ddc0820771..ced963049e0a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call) } } +/* + * run-time version of trace_event_get_offsets_<call>() that returns the last + * accessible offset of trace fields excluding __dynamic_array bytes + */ +int trace_event_get_offsets(struct trace_event_call *call) +{ + struct ftrace_event_field *tail; + struct list_head *head; + + head = trace_get_fields(call); + /* + * head->next points to the last field with the largest offset, + * since it was added last by trace_define_field() + */ + tail = list_first_entry(head, struct ftrace_event_field, link); + return tail->offset + tail->size; +} + int trace_event_raw_init(struct trace_event_call *call) { int id; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 919e0ddd8fcc..5546eec0505f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1149,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1184,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) return; entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e78f364cc192..b2b6efc083a4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, NULL, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(rec, size, rctx, + sys_data->enter_event->event.type, 1, regs, + head, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, NULL, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, + 1, regs, head, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7915142c89e4..c53485441c88 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) goto out; @@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, memset(data + len, 0, size - esize - len); } - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); out: preempt_enable(); } diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 502c9fc8db85..9959771bf808 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -19,6 +19,7 @@ hostprogs-y += lathist hostprogs-y += offwaketime hostprogs-y += spintest hostprogs-y += map_perf_test +hostprogs-y += test_overhead test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -38,6 +39,7 @@ lathist-objs := bpf_load.o libbpf.o lathist_user.o offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o spintest-objs := bpf_load.o libbpf.o spintest_user.o map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o +test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -56,6 +58,8 @@ always += lathist_kern.o always += offwaketime_kern.o always += spintest_kern.o always += map_perf_test_kern.o +always += test_overhead_tp_kern.o +always += test_overhead_kprobe_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -75,6 +79,7 @@ HOSTLOADLIBES_lathist += -lelf HOSTLOADLIBES_offwaketime += -lelf HOSTLOADLIBES_spintest += -lelf HOSTLOADLIBES_map_perf_test += -lelf -lrt +HOSTLOADLIBES_test_overhead += -lelf -lrt # point this to your LLVM backend with bpf support LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 58f86bd11b3d..022af71c2bb5 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -49,6 +49,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_socket = strncmp(event, "socket", 6) == 0; bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; + bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; enum bpf_prog_type prog_type; char buf[256]; int fd, efd, err, id; @@ -63,6 +64,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SOCKET_FILTER; } else if (is_kprobe || is_kretprobe) { prog_type = BPF_PROG_TYPE_KPROBE; + } else if (is_tracepoint) { + prog_type = BPF_PROG_TYPE_TRACEPOINT; } else { printf("Unknown event '%s'\n", event); return -1; @@ -111,12 +114,23 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) event, strerror(errno)); return -1; } - } - strcpy(buf, DEBUGFS); - strcat(buf, "events/kprobes/"); - strcat(buf, event); - strcat(buf, "/id"); + strcpy(buf, DEBUGFS); + strcat(buf, "events/kprobes/"); + strcat(buf, event); + strcat(buf, "/id"); + } else if (is_tracepoint) { + event += 11; + + if (*event == 0) { + printf("event name cannot be empty\n"); + return -1; + } + strcpy(buf, DEBUGFS); + strcat(buf, "events/"); + strcat(buf, event); + strcat(buf, "/id"); + } efd = open(buf, O_RDONLY, 0); if (efd < 0) { @@ -304,6 +318,7 @@ int load_bpf_file(char *path) if (memcmp(shname_prog, "kprobe/", 7) == 0 || memcmp(shname_prog, "kretprobe/", 10) == 0 || + memcmp(shname_prog, "tracepoint/", 11) == 0 || memcmp(shname_prog, "socket", 6) == 0) load_and_attach(shname_prog, insns, data_prog->d_size); } @@ -320,6 +335,7 @@ int load_bpf_file(char *path) if (memcmp(shname, "kprobe/", 7) == 0 || memcmp(shname, "kretprobe/", 10) == 0 || + memcmp(shname, "tracepoint/", 11) == 0 || memcmp(shname, "socket", 6) == 0) load_and_attach(shname, data->d_buf, data->d_size); } diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index c0aa5a9b9c48..983629a31c79 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -73,7 +73,7 @@ int waker(struct pt_regs *ctx) return 0; } -static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta) +static inline int update_counts(void *ctx, u32 pid, u64 delta) { struct key_t key = {}; struct wokeby_t *woke; @@ -100,15 +100,33 @@ static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta) return 0; } +#if 1 +/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ +struct sched_switch_args { + unsigned long long pad; + char prev_comm[16]; + int prev_pid; + int prev_prio; + long long prev_state; + char next_comm[16]; + int next_pid; + int next_prio; +}; +SEC("tracepoint/sched/sched_switch") +int oncpu(struct sched_switch_args *ctx) +{ + /* record previous thread sleep time */ + u32 pid = ctx->prev_pid; +#else SEC("kprobe/finish_task_switch") int oncpu(struct pt_regs *ctx) { struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + /* record previous thread sleep time */ + u32 pid = _(p->pid); +#endif u64 delta, ts, *tsp; - u32 pid; - /* record previous thread sleep time */ - pid = _(p->pid); ts = bpf_ktime_get_ns(); bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c new file mode 100644 index 000000000000..468a66a92ef9 --- /dev/null +++ b/samples/bpf/test_overhead_kprobe_kern.c @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) + +SEC("kprobe/__set_task_comm") +int prog(struct pt_regs *ctx) +{ + struct signal_struct *signal; + struct task_struct *tsk; + char oldcomm[16] = {}; + char newcomm[16] = {}; + u16 oom_score_adj; + u32 pid; + + tsk = (void *)PT_REGS_PARM1(ctx); + + pid = _(tsk->pid); + bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm); + bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx)); + signal = _(tsk->signal); + oom_score_adj = _(signal->oom_score_adj); + return 0; +} + +SEC("kprobe/urandom_read") +int prog2(struct pt_regs *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c new file mode 100644 index 000000000000..38f5c0b9da9f --- /dev/null +++ b/samples/bpf/test_overhead_tp_kern.c @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* from /sys/kernel/debug/tracing/events/task/task_rename/format */ +struct task_rename { + __u64 pad; + __u32 pid; + char oldcomm[16]; + char newcomm[16]; + __u16 oom_score_adj; +}; +SEC("tracepoint/task/task_rename") +int prog(struct task_rename *ctx) +{ + return 0; +} + +/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */ +struct urandom_read { + __u64 pad; + int got_bits; + int pool_left; + int input_left; +}; +SEC("tracepoint/random/urandom_read") +int prog2(struct urandom_read *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c new file mode 100644 index 000000000000..d291167fd3c7 --- /dev/null +++ b/samples/bpf/test_overhead_user.c @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <sys/types.h> +#include <asm/unistd.h> +#include <fcntl.h> +#include <unistd.h> +#include <assert.h> +#include <sys/wait.h> +#include <stdlib.h> +#include <signal.h> +#include <linux/bpf.h> +#include <string.h> +#include <time.h> +#include <sys/resource.h> +#include "libbpf.h" +#include "bpf_load.h" + +#define MAX_CNT 1000000 + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static void test_task_rename(int cpu) +{ + __u64 start_time; + char buf[] = "test\n"; + int i, fd; + + fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); + if (fd < 0) { + printf("couldn't open /proc\n"); + exit(1); + } + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) + write(fd, buf, sizeof(buf)); + printf("task_rename:%d: %lld events per sec\n", + cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + close(fd); +} + +static void test_urandom_read(int cpu) +{ + __u64 start_time; + char buf[4]; + int i, fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + printf("couldn't open /dev/urandom\n"); + exit(1); + } + start_time = time_get_ns(); + for (i = 0; i < MAX_CNT; i++) + read(fd, buf, sizeof(buf)); + printf("urandom_read:%d: %lld events per sec\n", + cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + close(fd); +} + +static void loop(int cpu, int flags) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + if (flags & 1) + test_task_rename(cpu); + if (flags & 2) + test_urandom_read(cpu); +} + +static void run_perf_test(int tasks, int flags) +{ + pid_t pid[tasks]; + int i; + + for (i = 0; i < tasks; i++) { + pid[i] = fork(); + if (pid[i] == 0) { + loop(i, flags); + exit(0); + } else if (pid[i] == -1) { + printf("couldn't spawn #%d process\n", i); + exit(1); + } + } + for (i = 0; i < tasks; i++) { + int status; + + assert(waitpid(pid[i], &status, 0) == pid[i]); + assert(status == 0); + } +} + +static void unload_progs(void) +{ + close(prog_fd[0]); + close(prog_fd[1]); + close(event_fd[0]); + close(event_fd[1]); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + int num_cpu = 8; + int test_flags = ~0; + + setrlimit(RLIMIT_MEMLOCK, &r); + + if (argc > 1) + test_flags = atoi(argv[1]) ? : test_flags; + if (argc > 2) + num_cpu = atoi(argv[2]) ? : num_cpu; + + if (test_flags & 0x3) { + printf("BASE\n"); + run_perf_test(num_cpu, test_flags); + } + + if (test_flags & 0xC) { + snprintf(filename, sizeof(filename), + "%s_kprobe_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/KPROBE\n"); + run_perf_test(num_cpu, test_flags >> 2); + unload_progs(); + } + + if (test_flags & 0x30) { + snprintf(filename, sizeof(filename), + "%s_tp_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/TRACEPOINT\n"); + run_perf_test(num_cpu, test_flags >> 4); + unload_progs(); + } + + return 0; +} |