diff options
-rw-r--r-- | arch/x86/net/bpf_jit_comp.c | 150 | ||||
-rw-r--r-- | include/linux/bpf.h | 22 | ||||
-rw-r--r-- | include/linux/filter.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 10 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 113 | ||||
-rw-r--r-- | kernel/bpf/core.c | 73 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 23 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 17 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 2 | ||||
-rw-r--r-- | net/core/filter.c | 2 | ||||
-rw-r--r-- | samples/bpf/Makefile | 8 | ||||
-rw-r--r-- | samples/bpf/bpf_helpers.h | 4 | ||||
-rw-r--r-- | samples/bpf/bpf_load.c | 57 | ||||
-rw-r--r-- | samples/bpf/sockex3_kern.c | 303 | ||||
-rw-r--r-- | samples/bpf/sockex3_user.c | 66 | ||||
-rw-r--r-- | samples/bpf/tracex5_kern.c | 75 | ||||
-rw-r--r-- | samples/bpf/tracex5_user.c | 46 |
17 files changed, 928 insertions, 45 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99f76103c6b7..2ca777635d8e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <asm/cacheflush.h> +#include <linux/bpf.h> int bpf_jit_enable __read_mostly; @@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) return ptr + len; } -#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0) +#define EMIT(bytes, len) \ + do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -186,31 +188,31 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) +#define STACKSIZE \ + (MAX_BPF_STACK + \ + 32 /* space for rbx, r13, r14, r15 */ + \ + 8 /* space for skb_copy_bits() buffer */) + +#define PROLOGUE_SIZE 51 + +/* emit x64 prologue code for BPF program and check it's size. + * bpf_tail_call helper will skip it while jumping into another program + */ +static void emit_prologue(u8 **pprog) { - struct bpf_insn *insn = bpf_prog->insnsi; - int insn_cnt = bpf_prog->len; - bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); - bool seen_exit = false; - u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i; - int proglen = 0; - u8 *prog = temp; - int stacksize = MAX_BPF_STACK + - 32 /* space for rbx, r13, r14, r15 */ + - 8 /* space for skb_copy_bits() buffer */; + u8 *prog = *pprog; + int cnt = 0; EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - /* sub rsp, stacksize */ - EMIT3_off32(0x48, 0x81, 0xEC, stacksize); + /* sub rsp, STACKSIZE */ + EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE); /* all classic BPF filters use R6(rbx) save it */ /* mov qword ptr [rbp-X],rbx */ - EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE); /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and @@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, */ /* mov qword ptr [rbp-X],r13 */ - EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8); /* mov qword ptr [rbp-X],r14 */ - EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16); /* mov qword ptr [rbp-X],r15 */ - EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); /* clear A and X registers */ EMIT2(0x31, 0xc0); /* xor eax, eax */ EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + /* clear tail_cnt: mov qword ptr [rbp-X], rax */ + EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + *pprog = prog; +} + +/* generate the following code: + * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... + * if (index >= array->map.max_entries) + * goto out; + * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + * prog = array->prog[index]; + * if (prog == NULL) + * goto out; + * goto *(prog->bpf_func + prologue_size); + * out: + */ +static void emit_bpf_tail_call(u8 **pprog) +{ + u8 *prog = *pprog; + int label1, label2, label3; + int cnt = 0; + + /* rdi - pointer to ctx + * rsi - pointer to bpf_array + * rdx - index in bpf_array + */ + + /* if (index >= array->map.max_entries) + * goto out; + */ + EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + offsetof(struct bpf_array, map.max_entries)); + EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ +#define OFFSET1 44 /* number of bytes to jump */ + EMIT2(X86_JBE, OFFSET1); /* jbe out */ + label1 = cnt; + + /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ +#define OFFSET2 33 + EMIT2(X86_JA, OFFSET2); /* ja out */ + label2 = cnt; + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + + /* prog = array->prog[index]; */ + EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */ + EMIT1(offsetof(struct bpf_array, prog)); + EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ + + /* if (prog == NULL) + * goto out; + */ + EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ +#define OFFSET3 10 + EMIT2(X86_JE, OFFSET3); /* je out */ + label3 = cnt; + + /* goto *(prog->bpf_func + prologue_size); */ + EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ + offsetof(struct bpf_prog, bpf_func)); + EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + + /* now we're ready to jump into next BPF program + * rdi == ctx (1st arg) + * rax == prog->bpf_func + prologue_size + */ + EMIT2(0xFF, 0xE0); /* jmp rax */ + + /* out: */ + BUILD_BUG_ON(cnt - label1 != OFFSET1); + BUILD_BUG_ON(cnt - label2 != OFFSET2); + BUILD_BUG_ON(cnt - label3 != OFFSET3); + *pprog = prog; +} + +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + int oldproglen, struct jit_context *ctx) +{ + struct bpf_insn *insn = bpf_prog->insnsi; + int insn_cnt = bpf_prog->len; + bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_exit = false; + u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + int i, cnt = 0; + int proglen = 0; + u8 *prog = temp; + + emit_prologue(&prog); + if (seen_ld_abs) { /* r9d : skb->len - skb->data_len (headlen) * r10 : skb->data @@ -739,6 +837,10 @@ xadd: if (is_imm8(insn->off)) } break; + case BPF_JMP | BPF_CALL | BPF_X: + emit_bpf_tail_call(&prog); + break; + /* cond jump */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: @@ -891,13 +993,13 @@ common_load: /* update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp-X] */ - EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE); /* mov r13, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8); /* mov r14, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16); /* mov r15, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d5cda067115a..8821b9a8689e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -126,6 +126,27 @@ struct bpf_prog_aux { struct work_struct work; }; +struct bpf_array { + struct bpf_map map; + u32 elem_size; + /* 'ownership' of prog_array is claimed by the first program that + * is going to use this map or by the first program which FD is stored + * in the map to make sure that all callers and callees have the same + * prog_type and JITed flag + */ + enum bpf_prog_type owner_prog_type; + bool owner_jited; + union { + char value[0] __aligned(8); + struct bpf_prog *prog[0] __aligned(8); + }; +}; +#define MAX_TAIL_CALL_CNT 32 + +u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); +void bpf_prog_array_map_clear(struct bpf_map *map); +bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); + #ifdef CONFIG_BPF_SYSCALL void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); @@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; +extern const struct bpf_func_proto bpf_tail_call_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 200be4a74a33..17724f6ea983 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) int sk_filter(struct sock *sk, struct sk_buff *skb); -void bpf_prog_select_runtime(struct bpf_prog *fp); +int bpf_prog_select_runtime(struct bpf_prog *fp); void bpf_prog_free(struct bpf_prog *fp); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9ebdf5701e8..f0a9af8b4dae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,7 @@ enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, }; enum bpf_prog_type { @@ -210,6 +211,15 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_l4_csum_replace, + + /** + * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success + */ + BPF_FUNC_tail_call, __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8a6616583f38..614bcd4c1d74 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -14,12 +14,7 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/mm.h> - -struct bpf_array { - struct bpf_map map; - u32 elem_size; - char value[0] __aligned(8); -}; +#include <linux/filter.h> /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) @@ -154,3 +149,109 @@ static int __init register_array_map(void) return 0; } late_initcall(register_array_map); + +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + /* only bpf_prog file descriptors can be stored in prog_array map */ + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + return array_map_alloc(attr); +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + synchronize_rcu(); + + /* make sure it's empty */ + for (i = 0; i < array->map.max_entries; i++) + BUG_ON(array->prog[i] != NULL); + kvfree(array); +} + +static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* only called from syscall */ +static int prog_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog, *old_prog; + u32 index = *(u32 *)key, ufd; + + if (map_flags != BPF_ANY) + return -EINVAL; + + if (index >= array->map.max_entries) + return -E2BIG; + + ufd = *(u32 *)value; + prog = bpf_prog_get(ufd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return -EINVAL; + } + + old_prog = xchg(array->prog + index, prog); + if (old_prog) + bpf_prog_put(old_prog); + + return 0; +} + +static int prog_array_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *old_prog; + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return -E2BIG; + + old_prog = xchg(array->prog + index, NULL); + if (old_prog) { + bpf_prog_put(old_prog); + return 0; + } else { + return -ENOENT; + } +} + +/* decrement refcnt of all bpf_progs that are stored in this map */ +void bpf_prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + for (i = 0; i < array->map.max_entries; i++) + prog_array_map_delete_elem(map, &i); +} + +static const struct bpf_map_ops prog_array_ops = { + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = prog_array_map_lookup_elem, + .map_update_elem = prog_array_map_update_elem, + .map_delete_elem = prog_array_map_delete_elem, +}; + +static struct bpf_map_type_list prog_array_type __read_mostly = { + .ops = &prog_array_ops, + .type = BPF_MAP_TYPE_PROG_ARRAY, +}; + +static int __init register_prog_array_map(void) +{ + bpf_register_map_type(&prog_array_type); + return 0; +} +late_initcall(register_prog_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 54f0e7fcd0e2..d44b25cbe460 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } +const struct bpf_func_proto bpf_tail_call_proto = { + .func = NULL, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, + [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, @@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; + u32 tail_call_cnt = 0; void *ptr; int off; @@ -431,6 +442,30 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_TAIL_CALL: { + struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog; + u64 index = BPF_R3; + + if (unlikely(index >= array->map.max_entries)) + goto out; + + if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + goto out; + + tail_call_cnt++; + + prog = READ_ONCE(array->prog[index]); + if (unlikely(!prog)) + goto out; + + ARG1 = BPF_R1; + insn = prog->insnsi; + goto select_insn; +out: + CONT; + } /* JMP */ JMP_JA: insn += insn->off; @@ -619,6 +654,40 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog) { } +bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) +{ + if (array->owner_prog_type) { + if (array->owner_prog_type != fp->type) + return false; + if (array->owner_jited != fp->jited) + return false; + } else { + array->owner_prog_type = fp->type; + array->owner_jited = fp->jited; + } + return true; +} + +static int check_tail_call(const struct bpf_prog *fp) +{ + struct bpf_prog_aux *aux = fp->aux; + int i; + + for (i = 0; i < aux->used_map_cnt; i++) { + struct bpf_array *array; + struct bpf_map *map; + + map = aux->used_maps[i]; + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + array = container_of(map, struct bpf_array, map); + if (!bpf_prog_array_compatible(array, fp)) + return -EINVAL; + } + + return 0; +} + /** * bpf_prog_select_runtime - select execution runtime for BPF program * @fp: bpf_prog populated with internal BPF program @@ -626,7 +695,7 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog) * try to JIT internal BPF program, if JIT is not available select interpreter * BPF program will be executed via BPF_PROG_RUN() macro */ -void bpf_prog_select_runtime(struct bpf_prog *fp) +int bpf_prog_select_runtime(struct bpf_prog *fp) { fp->bpf_func = (void *) __bpf_prog_run; @@ -634,6 +703,8 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) bpf_int_jit_compile(fp); /* Lock whole bpf_prog as read-only */ bpf_prog_lock_ro(fp); + + return check_tail_call(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3bae6c591914..98a69bd83069 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + /* prog_array stores refcnt-ed bpf_prog pointers + * release them all when user space closes prog_array_fd + */ + bpf_prog_array_map_clear(map); + bpf_map_put(map); return 0; } @@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog) */ BUG_ON(!prog->aux->ops->get_func_proto); + if (insn->imm == BPF_FUNC_tail_call) { + /* mark bpf_tail_call as different opcode + * to avoid conditional branch in + * interpeter for every normal call + * and to prevent accidental JITing by + * JIT compiler that doesn't support + * bpf_tail_call yet + */ + insn->imm = 0; + insn->code |= BPF_X; + continue; + } + fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions @@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - bpf_prog_select_runtime(prog); + err = bpf_prog_select_runtime(prog); + if (err < 0) + goto free_used_maps; err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47dcd3aa6e23..cfd9a40b9a5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id) fn->ret_type, func_id); return -EINVAL; } + + if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + func_id != BPF_FUNC_tail_call) + /* prog_array map type needs extra care: + * only allow to pass it into bpf_tail_call() for now. + * bpf_map_delete_elem() can be allowed in the future, + * while bpf_map_update_elem() must only be done via syscall + */ + return -EINVAL; + + if (func_id == BPF_FUNC_tail_call && + map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + /* don't allow any other map type to be passed into + * bpf_tail_call() + */ + return -EINVAL; + return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2d56ce501632..646445e41bd4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; case BPF_FUNC_trace_printk: /* diff --git a/net/core/filter.c b/net/core/filter.c index 6805717be614..3adcca6f17a4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1421,6 +1421,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; default: return NULL; } diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 8fdbd73429dd..46c6a8cf74d3 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -6,29 +6,35 @@ hostprogs-y := test_verifier test_maps hostprogs-y += sock_example hostprogs-y += sockex1 hostprogs-y += sockex2 +hostprogs-y += sockex3 hostprogs-y += tracex1 hostprogs-y += tracex2 hostprogs-y += tracex3 hostprogs-y += tracex4 +hostprogs-y += tracex5 test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o sock_example-objs := sock_example.o libbpf.o sockex1-objs := bpf_load.o libbpf.o sockex1_user.o sockex2-objs := bpf_load.o libbpf.o sockex2_user.o +sockex3-objs := bpf_load.o libbpf.o sockex3_user.o tracex1-objs := bpf_load.o libbpf.o tracex1_user.o tracex2-objs := bpf_load.o libbpf.o tracex2_user.o tracex3-objs := bpf_load.o libbpf.o tracex3_user.o tracex4-objs := bpf_load.o libbpf.o tracex4_user.o +tracex5-objs := bpf_load.o libbpf.o tracex5_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) always += sockex1_kern.o always += sockex2_kern.o +always += sockex3_kern.o always += tracex1_kern.o always += tracex2_kern.o always += tracex3_kern.o always += tracex4_kern.o +always += tracex5_kern.o always += tcbpf1_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -36,10 +42,12 @@ HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable HOSTLOADLIBES_sockex1 += -lelf HOSTLOADLIBES_sockex2 += -lelf +HOSTLOADLIBES_sockex3 += -lelf HOSTLOADLIBES_tracex1 += -lelf HOSTLOADLIBES_tracex2 += -lelf HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt +HOSTLOADLIBES_tracex5 += -lelf # point this to your LLVM backend with bpf support LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index f960b5fb3ed8..f531a0b3282d 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -21,6 +21,10 @@ static unsigned long long (*bpf_ktime_get_ns)(void) = (void *) BPF_FUNC_ktime_get_ns; static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = (void *) BPF_FUNC_trace_printk; +static void (*bpf_tail_call)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_tail_call; +static unsigned long long (*bpf_get_smp_processor_id)(void) = + (void *) BPF_FUNC_get_smp_processor_id; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 38dac5a53b51..da86a8e0a95a 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -16,6 +16,7 @@ #include <sys/ioctl.h> #include <sys/mman.h> #include <poll.h> +#include <ctype.h> #include "libbpf.h" #include "bpf_helpers.h" #include "bpf_load.h" @@ -29,6 +30,19 @@ int map_fd[MAX_MAPS]; int prog_fd[MAX_PROGS]; int event_fd[MAX_PROGS]; int prog_cnt; +int prog_array_fd = -1; + +static int populate_prog_array(const char *event, int prog_fd) +{ + int ind = atoi(event), err; + + err = bpf_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); + if (err < 0) { + printf("failed to store prog_fd in prog_array\n"); + return -1; + } + return 0; +} static int load_and_attach(const char *event, struct bpf_insn *prog, int size) { @@ -54,12 +68,40 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return -1; } + fd = bpf_prog_load(prog_type, prog, size, license, kern_version); + if (fd < 0) { + printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf); + return -1; + } + + prog_fd[prog_cnt++] = fd; + + if (is_socket) { + event += 6; + if (*event != '/') + return 0; + event++; + if (!isdigit(*event)) { + printf("invalid prog number\n"); + return -1; + } + return populate_prog_array(event, fd); + } + if (is_kprobe || is_kretprobe) { if (is_kprobe) event += 7; else event += 10; + if (*event == 0) { + printf("event name cannot be empty\n"); + return -1; + } + + if (isdigit(*event)) + return populate_prog_array(event, fd); + snprintf(buf, sizeof(buf), "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", is_kprobe ? 'p' : 'r', event, event); @@ -71,18 +113,6 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) } } - fd = bpf_prog_load(prog_type, prog, size, license, kern_version); - - if (fd < 0) { - printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf); - return -1; - } - - prog_fd[prog_cnt++] = fd; - - if (is_socket) - return 0; - strcpy(buf, DEBUGFS); strcat(buf, "events/kprobes/"); strcat(buf, event); @@ -130,6 +160,9 @@ static int load_maps(struct bpf_map_def *maps, int len) maps[i].max_entries); if (map_fd[i] < 0) return 1; + + if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) + prog_array_fd = map_fd[i]; } return 0; } diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c new file mode 100644 index 000000000000..2625b987944f --- /dev/null +++ b/samples/bpf/sockex3_kern.c @@ -0,0 +1,303 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" +#include <uapi/linux/in.h> +#include <uapi/linux/if.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/if_tunnel.h> +#include <uapi/linux/mpls.h> +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F + +struct bpf_map_def SEC("maps") jmp_table = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 8, +}; + +#define PARSE_VLAN 1 +#define PARSE_MPLS 2 +#define PARSE_IP 3 +#define PARSE_IPV6 4 + +/* protocol dispatch routine. + * It tail-calls next BPF program depending on eth proto + * Note, we could have used: + * bpf_tail_call(skb, &jmp_table, proto); + * but it would need large prog_array + */ +static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) +{ + switch (proto) { + case ETH_P_8021Q: + case ETH_P_8021AD: + bpf_tail_call(skb, &jmp_table, PARSE_VLAN); + break; + case ETH_P_MPLS_UC: + case ETH_P_MPLS_MC: + bpf_tail_call(skb, &jmp_table, PARSE_MPLS); + break; + case ETH_P_IP: + bpf_tail_call(skb, &jmp_table, PARSE_IP); + break; + case ETH_P_IPV6: + bpf_tail_call(skb, &jmp_table, PARSE_IPV6); + break; + } +} + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct flow_keys { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u32 ip_proto; +}; + +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) +{ + __u64 w0 = load_word(ctx, off); + __u64 w1 = load_word(ctx, off + 4); + __u64 w2 = load_word(ctx, off + 8); + __u64 w3 = load_word(ctx, off + 12); + + return (__u32)(w0 ^ w1 ^ w2 ^ w3); +} + +struct globals { + struct flow_keys flow; + __u32 nhoff; +}; + +struct bpf_map_def SEC("maps") percpu_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct globals), + .max_entries = 32, +}; + +/* user poor man's per_cpu until native support is ready */ +static struct globals *this_cpu_globals(void) +{ + u32 key = bpf_get_smp_processor_id(); + + return bpf_map_lookup_elem(&percpu_map, &key); +} + +/* some simple stats for user space consumption */ +struct pair { + __u64 packets; + __u64 bytes; +}; + +struct bpf_map_def SEC("maps") hash_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct flow_keys), + .value_size = sizeof(struct pair), + .max_entries = 1024, +}; + +static void update_stats(struct __sk_buff *skb, struct globals *g) +{ + struct flow_keys key = g->flow; + struct pair *value; + + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) { + __sync_fetch_and_add(&value->packets, 1); + __sync_fetch_and_add(&value->bytes, skb->len); + } else { + struct pair val = {1, skb->len}; + + bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); + } +} + +static __always_inline void parse_ip_proto(struct __sk_buff *skb, + struct globals *g, __u32 ip_proto) +{ + __u32 nhoff = g->nhoff; + int poff; + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + }; + + __u32 gre_flags = load_half(skb, + nhoff + offsetof(struct gre_hdr, flags)); + __u32 gre_proto = load_half(skb, + nhoff + offsetof(struct gre_hdr, proto)); + + if (gre_flags & (GRE_VERSION|GRE_ROUTING)) + break; + + nhoff += 4; + if (gre_flags & GRE_CSUM) + nhoff += 4; + if (gre_flags & GRE_KEY) + nhoff += 4; + if (gre_flags & GRE_SEQ) + nhoff += 4; + + g->nhoff = nhoff; + parse_eth_proto(skb, gre_proto); + break; + } + case IPPROTO_IPIP: + parse_eth_proto(skb, ETH_P_IP); + break; + case IPPROTO_IPV6: + parse_eth_proto(skb, ETH_P_IPV6); + break; + case IPPROTO_TCP: + case IPPROTO_UDP: + g->flow.ports = load_word(skb, nhoff); + case IPPROTO_ICMP: + g->flow.ip_proto = ip_proto; + update_stats(skb, g); + break; + default: + break; + } +} + +PROG(PARSE_IP)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, verlen, ip_proto; + + if (!g) + return 0; + + nhoff = g->nhoff; + + if (unlikely(ip_is_fragment(skb, nhoff))) + return 0; + + ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); + + if (ip_proto != IPPROTO_GRE) { + g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); + g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); + } + + verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); + nhoff += (verlen & 0xF) << 2; + + g->nhoff = nhoff; + parse_ip_proto(skb, g, ip_proto); + return 0; +} + +PROG(PARSE_IPV6)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, ip_proto; + + if (!g) + return 0; + + nhoff = g->nhoff; + + ip_proto = load_byte(skb, + nhoff + offsetof(struct ipv6hdr, nexthdr)); + g->flow.src = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, saddr)); + g->flow.dst = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, daddr)); + nhoff += sizeof(struct ipv6hdr); + + g->nhoff = nhoff; + parse_ip_proto(skb, g, ip_proto); + return 0; +} + +PROG(PARSE_VLAN)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, proto; + + if (!g) + return 0; + + nhoff = g->nhoff; + + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + g->nhoff = nhoff; + + parse_eth_proto(skb, proto); + + return 0; +} + +PROG(PARSE_MPLS)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, label; + + if (!g) + return 0; + + nhoff = g->nhoff; + + label = load_word(skb, nhoff); + nhoff += sizeof(struct mpls_label); + g->nhoff = nhoff; + + if (label & MPLS_LS_S_MASK) { + __u8 verlen = load_byte(skb, nhoff); + if ((verlen & 0xF0) == 4) + parse_eth_proto(skb, ETH_P_IP); + else + parse_eth_proto(skb, ETH_P_IPV6); + } else { + parse_eth_proto(skb, ETH_P_MPLS_UC); + } + + return 0; +} + +SEC("socket/0") +int main_prog(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff = ETH_HLEN; + __u32 proto = load_half(skb, 12); + + if (!g) + return 0; + + g->nhoff = nhoff; + parse_eth_proto(skb, proto); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c new file mode 100644 index 000000000000..2617772d060d --- /dev/null +++ b/samples/bpf/sockex3_user.c @@ -0,0 +1,66 @@ +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" +#include <unistd.h> +#include <arpa/inet.h> + +struct flow_keys { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u32 ip_proto; +}; + +struct pair { + __u64 packets; + __u64 bytes; +}; + +int main(int argc, char **argv) +{ + char filename[256]; + FILE *f; + int i, sock; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4], + sizeof(__u32)) == 0); + + if (argc > 1) + f = popen("ping -c5 localhost", "r"); + else + f = popen("netperf -l 4 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + struct flow_keys key = {}, next_key; + struct pair value; + + sleep(1); + printf("IP src.port -> dst.port bytes packets\n"); + while (bpf_get_next_key(map_fd[2], &key, &next_key) == 0) { + bpf_lookup_elem(map_fd[2], &next_key, &value); + printf("%s.%05d -> %s.%05d %12lld %12lld\n", + inet_ntoa((struct in_addr){htonl(next_key.src)}), + next_key.port16[0], + inet_ntoa((struct in_addr){htonl(next_key.dst)}), + next_key.port16[1], + value.bytes, value.packets); + key = next_key; + } + } + return 0; +} diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c new file mode 100644 index 000000000000..b71fe07a7a7a --- /dev/null +++ b/samples/bpf/tracex5_kern.c @@ -0,0 +1,75 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/seccomp.h> +#include "bpf_helpers.h" + +#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F + +struct bpf_map_def SEC("maps") progs = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1024, +}; + +SEC("kprobe/seccomp_phase1") +int bpf_prog1(struct pt_regs *ctx) +{ + struct seccomp_data sd = {}; + + bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di); + + /* dispatch into next BPF program depending on syscall number */ + bpf_tail_call(ctx, &progs, sd.nr); + + /* fall through -> unknown syscall */ + if (sd.nr >= __NR_getuid && sd.nr <= __NR_getsid) { + char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n"; + bpf_trace_printk(fmt, sizeof(fmt), sd.nr); + } + return 0; +} + +/* we jump here when syscall number == __NR_write */ +PROG(__NR_write)(struct pt_regs *ctx) +{ + struct seccomp_data sd = {}; + + bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di); + if (sd.args[2] == 512) { + char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; + bpf_trace_printk(fmt, sizeof(fmt), + sd.args[0], sd.args[1], sd.args[2]); + } + return 0; +} + +PROG(__NR_read)(struct pt_regs *ctx) +{ + struct seccomp_data sd = {}; + + bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di); + if (sd.args[2] > 128 && sd.args[2] <= 1024) { + char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; + bpf_trace_printk(fmt, sizeof(fmt), + sd.args[0], sd.args[1], sd.args[2]); + } + return 0; +} + +PROG(__NR_mmap)(struct pt_regs *ctx) +{ + char fmt[] = "mmap\n"; + bpf_trace_printk(fmt, sizeof(fmt)); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c new file mode 100644 index 000000000000..a04dd3cd4358 --- /dev/null +++ b/samples/bpf/tracex5_user.c @@ -0,0 +1,46 @@ +#include <stdio.h> +#include <linux/bpf.h> +#include <unistd.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <sys/prctl.h> +#include "libbpf.h" +#include "bpf_load.h" + +/* install fake seccomp program to enable seccomp code path inside the kernel, + * so that our kprobe attached to seccomp_phase1() can be triggered + */ +static void install_accept_all_seccomp(void) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + if (prctl(PR_SET_SECCOMP, 2, &prog)) + perror("prctl"); +} + +int main(int ac, char **argv) +{ + FILE *f; + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + install_accept_all_seccomp(); + + f = popen("dd if=/dev/zero of=/dev/null count=5", "r"); + (void) f; + + read_trace_pipe(); + + return 0; +} |