From dbcfe5f76dd5266b8f308b5a8f9ef52f74b2d6e7 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:46 -0800 Subject: bpf: split check_mem_access logic for map values Move the logic to check memory accesses to a PTR_TO_MAP_VALUE_ADJ from check_mem_access() to a separate helper check_map_access_adj(). This enables to use those checks in other parts of the verifier as well, where boundaries on PTR_TO_MAP_VALUE_ADJ might need to be checked, for example when checking helper function arguments. The same thing is already happening for other types such as PTR_TO_PACKET and its check_packet_access() helper. The code has been copied verbatim, with the only difference of removing the "off += reg->max_value" statement and moving the sum into the call statement to check_map_access(), as that was only needed due to the earlier common check_map_access() call. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 88 ++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 83ed2f8f6f22..8333fbcfbfe7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -635,6 +635,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } +/* check read/write into an adjusted map element */ +static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, + int off, int size) +{ + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; + int err; + + /* We adjusted the register to this map value, so we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + return check_map_access(env, regno, reg->max_value + off, size); +} + #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, @@ -775,45 +820,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - /* If we adjusted the register to this map value at all then we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. - */ - if (reg->type == PTR_TO_MAP_VALUE_ADJ) { - if (log_level) - print_verifier_state(state); - env->varlen_map_value_access = true; - /* The minimum value is only important with signed - * comparisons where we can't assume the floor of a - * value is 0. If we are using signed variables for our - * index'es we need to make sure that whatever we use - * will have a set floor within our range. - */ - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; - } - err = check_map_access(env, regno, reg->min_value + off, - size); - if (err) { - verbose("R%d min value is outside of the array range\n", - regno); - return err; - } - - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. - */ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", - regno); - return -EACCES; - } - off += reg->max_value; - } - err = check_map_access(env, regno, off, size); + if (reg->type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno, off, size); + else + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); -- cgit v1.2.3 From 5722569bb9c3bd922c4f10b5b2912fe88c255312 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:47 -0800 Subject: bpf: allow helpers access to map element values Enable helpers to directly access a map element value by passing a register type PTR_TO_MAP_VALUE (or PTR_TO_MAP_VALUE_ADJ) to helper arguments ARG_PTR_TO_STACK or ARG_PTR_TO_RAW_STACK. This enables several use cases. For example, a typical tracing program might want to capture pathnames passed to sys_open() with: struct trace_data { char pathname[PATHLEN]; }; SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { struct trace_data data; bpf_probe_read(data.pathname, sizeof(data.pathname), ctx->di); /* consume data.pathname, for example via * bpf_trace_printk() or bpf_perf_event_output() */ } Such a program could easily hit the stack limit in case PATHLEN needs to be large or more local variables need to exist, both of which are quite common scenarios. Allowing direct helper access to map element values, one could do: struct bpf_map_def SEC("maps") scratch_map = { .type = BPF_MAP_TYPE_PERCPU_ARRAY, .key_size = sizeof(u32), .value_size = sizeof(struct trace_data), .max_entries = 1, }; SEC("kprobe/sys_open") int bpf_sys_open(struct pt_regs *ctx) { int id = 0; struct trace_data *p = bpf_map_lookup_elem(&scratch_map, &id); if (!p) return; bpf_probe_read(p->pathname, sizeof(p->pathname), ctx->di); /* consume p->pathname, for example via * bpf_trace_printk() or bpf_perf_event_output() */ } And wouldn't risk exhausting the stack. Code changes are loosely modeled after commit 6841de8b0d03 ("bpf: allow helpers access the packet directly"). Unlike with PTR_TO_PACKET, these changes just work with ARG_PTR_TO_STACK and ARG_PTR_TO_RAW_STACK (not ARG_PTR_TO_MAP_KEY, ARG_PTR_TO_MAP_VALUE, ...): adding those would be trivial, but since there is not currently a use case for that, it's reasonable to limit the set of changes. Also, add new tests to make sure accesses to map element values from helpers never go out of boundary, even when adjusted. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 +- tools/testing/selftests/bpf/test_verifier.c | 491 ++++++++++++++++++++++++++++ 2 files changed, 498 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8333fbcfbfe7..b7014606745b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -627,7 +627,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; - if (off < 0 || off + size > map->value_size) { + if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -1025,7 +1025,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (type == CONST_IMM && reg->imm == 0) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != expected_type) + else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { @@ -1088,6 +1089,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (regs[regno - 1].type == PTR_TO_PACKET) err = check_packet_access(env, regno - 1, 0, reg->imm); + else if (regs[regno - 1].type == PTR_TO_MAP_VALUE) + err = check_map_access(env, regno - 1, 0, reg->imm); + else if (regs[regno - 1].type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno - 1, 0, reg->imm); else err = check_stack_boundary(env, regno - 1, reg->imm, zero_size_allowed, meta); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 853d7e43434a..b7732e557bf9 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2905,6 +2905,497 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "invalid bpf_context access", }, + { + "helper access to map: full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: out-of-bound range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) + 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=56", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: negative range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=-8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): out-of-bound range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo) + 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=52", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): negative range (> adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=-8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): negative range (< adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, -1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): out-of-bound range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo) + 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=52", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): negative range (> adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=-8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): negative range (< adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, -1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): no max check", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is negative, either use unsigned index or do a if (index >=0) check", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): wrong max check", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo) + 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=45", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From f0318d01b694485af9678a4e120328ae3555be6d Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:48 -0800 Subject: bpf: allow adjusted map element values to spill commit 484611357c19 ("bpf: allow access into map value arrays") introduces the ability to do pointer math inside a map element value via the PTR_TO_MAP_VALUE_ADJ register type. The current support doesn't handle the case where a PTR_TO_MAP_VALUE_ADJ is spilled into the stack, limiting several use cases, especially when generating bpf code from a compiler. Handle this case by explicitly enabling the register type PTR_TO_MAP_VALUE_ADJ to be spilled. Also, make sure that min_value and max_value are reset just for BPF_LDX operations that don't result in a restore of a spilled register from stack. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 21 +++++++++---- tools/testing/selftests/bpf/test_verifier.c | 46 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b7014606745b..59ed07b9b4ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) regs[regno].max_value = BPF_REGISTER_MAX_RANGE; } +static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, + u32 regno) +{ + mark_reg_unknown_value(regs, regno); + reset_reg_range_values(regs, regno); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) switch (type) { case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); return 0; } } @@ -825,7 +834,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, else err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = UNKNOWN_VALUE; @@ -837,7 +847,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; } @@ -870,7 +881,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -2744,7 +2756,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - reset_reg_range_values(regs, insn->dst_reg); if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b7732e557bf9..e7b075819c08 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3396,6 +3396,52 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "map element value is preserved across register spilling", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -184), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result = ACCEPT, + .result_unpriv = REJECT, + }, + { + "map element value (adjusted) is preserved across register spilling", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct test_val, foo)), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -184), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr_unpriv = "R0 pointer arithmetic prohibited", + .result = ACCEPT, + .result_unpriv = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 06c1c049721a995dee2829ad13b24aaf5d7c5cce Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:49 -0800 Subject: bpf: allow helpers access to variable memory Currently, helpers that read and write from/to the stack can do so using a pair of arguments of type ARG_PTR_TO_STACK and ARG_CONST_STACK_SIZE. ARG_CONST_STACK_SIZE accepts a constant register of type CONST_IMM, so that the verifier can safely check the memory access. However, requiring the argument to be a constant can be limiting in some circumstances. Since the current logic keeps track of the minimum and maximum value of a register throughout the simulated execution, ARG_CONST_STACK_SIZE can be changed to also accept an UNKNOWN_VALUE register in case its boundaries have been set and the range doesn't cause invalid memory accesses. One common situation when this is useful: int len; char buf[BUFSIZE]; /* BUFSIZE is 128 */ if (some_condition) len = 42; else len = 84; some_helper(..., buf, len & (BUFSIZE - 1)); The compiler can often decide to assign the constant values 42 or 48 into a variable on the stack, instead of keeping it in a register. When the variable is then read back from stack into the register in order to be passed to the helper, the verifier will not be able to recognize the register as constant (the verifier is not currently tracking all constant writes into memory), and the program won't be valid. However, by allowing the helper to accept an UNKNOWN_VALUE register, this program will work because the bitwise AND operation will set the range of possible values for the UNKNOWN_VALUE register to [0, BUFSIZE), so the verifier can guarantee the helper call will be safe (assuming the argument is of type ARG_CONST_STACK_SIZE_OR_ZERO, otherwise one more check against 0 would be needed). Custom ranges can be set not only with ALU operations, but also by explicitly comparing the UNKNOWN_VALUE register with constants. Another very common example happens when intercepting system call arguments and accessing user-provided data of variable size using bpf_probe_read(). One can load at runtime the user-provided length in an UNKNOWN_VALUE register, and then read that exact amount of data up to a compile-time determined limit in order to fit into the proper local storage allocated on the stack, without having to guess a suboptimal access size at compile time. Also, in case the helpers accepting the UNKNOWN_VALUE register operate in raw mode, disable the raw mode so that the program is required to initialize all memory, since there is no guarantee the helper will fill it completely, leaving possibilities for data leak (just relevant when the memory used by the helper is the stack, not when using a pointer to map element value or packet). In other words, ARG_PTR_TO_RAW_STACK will be treated as ARG_PTR_TO_STACK. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 74 ++++- tools/testing/selftests/bpf/test_verifier.c | 410 ++++++++++++++++++++++++++++ 2 files changed, 474 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 59ed07b9b4ea..3d4f7bf32aaf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -980,6 +980,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } +static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + + switch (regs[regno].type) { + case PTR_TO_PACKET: + return check_packet_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE: + return check_map_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE_ADJ: + return check_map_access_adj(env, regno, 0, access_size); + default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_stack_boundary(env, regno, access_size, + zero_size_allowed, meta); + } +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1018,7 +1037,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; - if (type != expected_type) + /* One exception. Allow UNKNOWN_VALUE registers when the + * boundaries are known and don't cause unsafe memory accesses + */ + if (type != UNKNOWN_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1099,15 +1121,47 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - if (regs[regno - 1].type == PTR_TO_PACKET) - err = check_packet_access(env, regno - 1, 0, reg->imm); - else if (regs[regno - 1].type == PTR_TO_MAP_VALUE) - err = check_map_access(env, regno - 1, 0, reg->imm); - else if (regs[regno - 1].type == PTR_TO_MAP_VALUE_ADJ) - err = check_map_access_adj(env, regno - 1, 0, reg->imm); - else - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + + /* If the register is UNKNOWN_VALUE, the access check happens + * using its boundaries. Otherwise, just use its imm + */ + if (type == UNKNOWN_VALUE) { + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); + if (err) + return err; + } else { + /* register is CONST_IMM */ + err = check_helper_mem_access(env, regno - 1, reg->imm, + zero_size_allowed, meta); + } } return err; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index e7b075819c08..9bb45346dc72 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3442,6 +3442,416 @@ static struct bpf_test tests[] = { .result = ACCEPT, .result_unpriv = REJECT, }, + { + "helper access to variable memory: stack, bitwise AND + JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, bitwise AND, zero included", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, bitwise AND + JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 65), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=65", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP (signed), correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, bounds + offset", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 5), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=65", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 65, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=65", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, no max check", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R2 unbounded memory access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, no min check", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP (signed), no min check", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R2 min value is negative", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map, JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val), 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map, JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val) + 1, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=49", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map adjusted, JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val) - 20, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map adjusted, JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val) - 19, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size > 0 not allowed on NULL", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_csum_diff), + BPF_EXIT_INSN(), + }, + .errstr = "R1 type=imm expected=fp", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "helper access to variable memory: size = 0 not allowed on != NULL", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_csum_diff), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-8 access_size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "helper access to variable memory: 8 bytes leak", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), + BPF_EXIT_INSN(), + }, + .errstr = "invalid indirect read from stack off -64+32 size 64", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: 8 bytes no leak (init memory)", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 32), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 32), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 39f19ebbf57b403695f7b5f9cf322fe1ddb5d7fb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 9 Jan 2017 10:19:50 -0800 Subject: bpf: rename ARG_PTR_TO_STACK since ARG_PTR_TO_STACK is no longer just pointer to stack rename it to ARG_PTR_TO_MEM and adjust comment. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 12 ++++++------ kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 28 ++++++++++++++-------------- kernel/trace/bpf_trace.c | 20 ++++++++++---------- net/core/filter.c | 40 ++++++++++++++++++++-------------------- 5 files changed, 52 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f74ae68086dc..94ea8d2383e6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -69,14 +69,14 @@ enum bpf_arg_type { /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ - ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ - ARG_PTR_TO_RAW_STACK, /* any pointer to eBPF program stack, area does not - * need to be initialized, helper function must fill - * all bytes or clear them in error case. + ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ + ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, + * helper function must fill all bytes or clear + * them in error case. */ - ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ - ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */ + ARG_CONST_SIZE, /* number of bytes accessed from memory */ + ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 045cbe673356..3d24e238221e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3d4f7bf32aaf..2efdc9128e3c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1034,8 +1034,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_STACK; if (type != PTR_TO_PACKET && type != expected_type) goto err_type; - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { expected_type = CONST_IMM; /* One exception. Allow UNKNOWN_VALUE registers when the * boundaries are known and don't cause unsafe memory accesses @@ -1050,8 +1050,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_STACK || - arg_type == ARG_PTR_TO_RAW_STACK) { + } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -1062,7 +1062,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; + meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -1108,9 +1108,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { - bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it @@ -1118,7 +1118,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + verbose("ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1235,15 +1235,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn) { int count = 0; - if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; return count > 1 ? -EINVAL : 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fa77311dadb2..f883c43c96f3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, }; @@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto *bpf_get_probe_write_proto(void) @@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_trace_printk_proto(void) @@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); @@ -492,8 +492,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, diff --git a/net/core/filter.c b/net/core/filter.c index 1969b3f118c1..f4d16a905754 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1416,8 +1416,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1447,8 +1447,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_RAW_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -1601,10 +1601,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2306,8 +2306,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -2377,8 +2377,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2412,8 +2412,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; @@ -2483,8 +2483,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2509,8 +2509,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * @@ -2593,8 +2593,8 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * -- cgit v1.2.3 From a5ef01aaac245d37edf113d65b0c146e96d841d1 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jan 2017 15:02:07 +0100 Subject: bpf: Remove unused but set variable in __bpf_lru_list_shrink_inactive() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unused but set variable 'first_node' in __bpf_lru_list_shrink_inactive() to fix the following GCC warning when building with 'W=1': kernel/bpf/bpf_lru_list.c:216:41: warning: variable ‘first_node’ set but not used [-Wunused-but-set-variable] Cc: Martin KaFai Lau Signed-off-by: Tobias Klauser Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 89b7ef41c86b..d78501ee0609 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - struct bpf_lru_node *node, *tmp_node, *first_node; + struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; - first_node = list_first_entry(inactive, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); -- cgit v1.2.3 From 3bf003335ba356aac5a43e28640159d4ae8a2a60 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jan 2017 15:02:16 +0100 Subject: bpf: Make unnecessarily global functions static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the functions __local_list_pop_free(), __local_list_pop_pending(), bpf_common_lru_populate() and bpf_percpu_lru_populate() static as they are not used outide of bpf_lru_list.c This fixes the following GCC warnings when building with 'W=1': kernel/bpf/bpf_lru_list.c:363:22: warning: no previous prototype for ‘__local_list_pop_free’ [-Wmissing-prototypes] kernel/bpf/bpf_lru_list.c:376:22: warning: no previous prototype for ‘__local_list_pop_pending’ [-Wmissing-prototypes] kernel/bpf/bpf_lru_list.c:560:6: warning: no previous prototype for ‘bpf_common_lru_populate’ [-Wmissing-prototypes] kernel/bpf/bpf_lru_list.c:577:6: warning: no previous prototype for ‘bpf_percpu_lru_populate’ [-Wmissing-prototypes] Cc: Martin KaFai Lau Signed-off-by: Tobias Klauser Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index d78501ee0609..f62d1d56f41d 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -360,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru, list_add(&node->list, local_pending_list(loc_l)); } -struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; @@ -373,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) return node; } -struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, - struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; @@ -557,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) bpf_common_lru_push_free(lru, node); } -void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -574,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } -void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { u32 i, pcpu_entries; int cpu; -- cgit v1.2.3 From 6b8cc1d11ef75c5b9c530b3d0d148f3c2dd25f93 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Jan 2017 11:51:32 +0100 Subject: bpf: pass original insn directly to convert_ctx_access Currently, when calling convert_ctx_access() callback for the various program types, we pass in insn->dst_reg, insn->src_reg, insn->off from the original instruction. This information is needed to rewrite the instruction that is based on the user ctx structure into a kernel representation for the ctx. As we'd like to allow access size beyond just BPF_W, we'd need also insn->code for that in order to decode the original access size. Given that, lets just pass insn directly to the convert_ctx_access() callback and work on that to not clutter the callback with even more arguments we need to pass when everything is already contained in insn. So lets go through that once, no functional change. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 ++- kernel/bpf/verifier.c | 3 +- kernel/trace/bpf_trace.c | 15 ++--- net/core/filter.c | 139 +++++++++++++++++++++++++---------------------- 4 files changed, 87 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 94ea8d2383e6..f8c3560b01db 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -161,9 +161,10 @@ struct bpf_verifier_ops { enum bpf_reg_type *reg_type); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); - u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn, struct bpf_prog *prog); + u32 (*convert_ctx_access)(enum bpf_access_type type, + const struct bpf_insn *src, + struct bpf_insn *dst, + struct bpf_prog *prog); }; struct bpf_prog_type_list { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2efdc9128e3c..df7e47244e75 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3177,8 +3177,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) continue; - cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f883c43c96f3..1860e7f1e5a8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -572,28 +572,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - data), dst_reg, src_reg, + data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); - *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, offsetof(struct perf_sample_data, period)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - regs), dst_reg, src_reg, + regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, + si->off); break; } diff --git a/net/core/filter.c b/net/core/filter.c index f4d16a905754..8cfbdefbfb1c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2972,32 +2972,33 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; + int off; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, len): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct __sk_buff, protocol): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, protocol)); break; case offsetof(struct __sk_buff, vlan_proto): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, vlan_proto)); break; @@ -3005,17 +3006,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); break; case offsetof(struct __sk_buff, ingress_ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, skb_iif)); break; @@ -3023,17 +3024,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct __sk_buff, hash): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, hash)); break; @@ -3041,63 +3042,74 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, mark)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, mark)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, + si->src_reg, insn); case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, + si->src_reg, insn); case offsetof(struct __sk_buff, vlan_present): return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, vlan_tci): return convert_skb_access(SKF_AD_VLAN_TAG, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); prog->cb_access = 1; - ctx_off -= offsetof(struct __sk_buff, cb[0]); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, data); + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): - ctx_off -= offsetof(struct __sk_buff, tc_classid); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); + + off = si->off; + off -= offsetof(struct __sk_buff, tc_classid); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, tc_classid); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_end): - ctx_off -= offsetof(struct __sk_buff, data_end); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, - ctx_off); + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): @@ -3105,110 +3117,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, tc_index)); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, tc_index)); - break; #else if (type == BPF_WRITE) - *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else - *insn++ = BPF_MOV64_IMM(dst_reg, 0); - break; + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif + break; } return insn - insn_buf; } static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - int dst_reg, int src_reg, - int ctx_off, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sock, sk_family)); break; case offsetof(struct bpf_sock, type): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); break; case offsetof(struct bpf_sock, protocol): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; } return insn - insn_buf; } -static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; default: - return sk_filter_convert_ctx_access(type, dst_reg, src_reg, - ctx_off, insn_buf, prog); + return sk_filter_convert_ctx_access(type, si, insn_buf, prog); } return insn - insn_buf; } -static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 xdp_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; } -- cgit v1.2.3 From 62c7989b24dbd348c2507ee6458ebf5637d6ddb5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Jan 2017 11:51:33 +0100 Subject: bpf: allow b/h/w/dw access for bpf's cb in ctx When structs are used to store temporary state in cb[] buffer that is used with programs and among tail calls, then the generated code will not always access the buffer in bpf_w chunks. We can ease programming of it and let this act more natural by allowing for aligned b/h/w/dw sized access for cb[] ctx member. Various test cases are attached as well for the selftest suite. Potentially, this can also be reused for other program types to pass data around. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 8 +- net/core/filter.c | 41 ++- tools/testing/selftests/bpf/test_verifier.c | 442 +++++++++++++++++++++++++++- 3 files changed, 478 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df7e47244e75..d60e12c67266 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3165,10 +3165,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else diff --git a/net/core/filter.c b/net/core/filter.c index 8cfbdefbfb1c..90383860e224 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2776,11 +2776,33 @@ static bool __is_valid_access(int off, int size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + if (size == sizeof(__u16) && + off > offsetof(struct __sk_buff, cb[4]) + sizeof(__u16)) + return false; + if (size == sizeof(__u32) && + off > offsetof(struct __sk_buff, cb[4])) + return false; + if (size == sizeof(__u64) && + off > offsetof(struct __sk_buff, cb[2])) + return false; + if (size != sizeof(__u8) && + size != sizeof(__u16) && + size != sizeof(__u32) && + size != sizeof(__u64)) + return false; + break; + default: + if (size != sizeof(__u32)) + return false; + } return true; } @@ -2799,7 +2821,7 @@ static bool sk_filter_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2823,7 +2845,7 @@ static bool lwt_is_valid_access(int off, int size, case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2915,7 +2937,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: case offsetof(struct __sk_buff, tc_classid): break; default: @@ -3066,8 +3088,11 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct qdisc_skb_cb, data)) % + sizeof(__u64)); prog->cb_access = 1; off = si->off; @@ -3075,10 +3100,10 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 9bb45346dc72..1aa73241c999 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -859,15 +859,451 @@ static struct bpf_test tests[] = { .result = REJECT, }, { - "check non-u32 access to cb", + "check cb access: byte", .insns = { - BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_1, + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 3), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: byte, oob 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, oob 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) - 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, oob 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, oob 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) - 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, wrong type", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "check cb access: half", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: half, unaligned", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 1), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: half, oob 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, oob 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) - 2), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, oob 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, oob 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) - 2), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, wrong type", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "check cb access: word", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: word, unaligned 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: word, unaligned 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 1), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: word, unaligned 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: word, unaligned 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 3), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: double", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: double, unaligned 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: double, unaligned 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: double, oob 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) - 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 5", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 6", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) - 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, wrong type", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", - .errstr_unpriv = "R1 leaks addr", .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, }, { "check out of range skb->cb access", -- cgit v1.2.3 From 2d071c643f1cd15a24172de4b5b7ae2adb93abbb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 15 Jan 2017 01:34:25 +0100 Subject: bpf, trace: make ctx access checks more robust Make sure that ctx cannot potentially be accessed oob by asserting explicitly that ctx access size into pt_regs for BPF_PROG_TYPE_KPROBE programs must be within limits. In case some 32bit archs have pt_regs not being a multiple of 8, then BPF_DW access could cause such access. BPF_PROG_TYPE_KPROBE progs don't have a ctx conversion function since there's no extra mapping needed. kprobe_prog_is_valid_access() didn't enforce sizeof(long) as the only allowed access size, since LLVM can generate non BPF_W/BPF_DW access to regs from time to time. For BPF_PROG_TYPE_TRACEPOINT we don't have a ctx conversion either, so add a BUILD_BUG_ON() check to make sure that BPF_DW access will not be a similar issue in future (ctx works on event buffer as opposed to pt_regs there). Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1860e7f1e5a8..c22a961d1a42 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -459,6 +459,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return false; if (off % size != 0) return false; + /* + * Assertion for 32 bit to make sure last 8 byte access + * (BPF_DW) to the last 4 byte member is disallowed. + */ + if (off + size > sizeof(struct pt_regs)) + return false; + return true; } @@ -540,6 +547,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (off % size != 0) return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } -- cgit v1.2.3 From a5e8c07059d0f0b31737408711d44794928ac218 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 18 Jan 2017 17:55:49 +0000 Subject: bpf: add bpf_probe_read_str helper Provide a simple helper with the same semantics of strncpy_from_unsafe(): int bpf_probe_read_str(void *dst, int size, const void *unsafe_addr) This gives more flexibility to a bpf program. A typical use case is intercepting a file name during sys_open(). The current approach is: SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { char buf[PATHLEN]; // PATHLEN is defined to 256 bpf_probe_read(buf, sizeof(buf), ctx->di); /* consume buf */ } This is suboptimal because the size of the string needs to be estimated at compile time, causing more memory to be copied than often necessary, and can become more problematic if further processing on buf is done, for example by pushing it to userspace via bpf_perf_event_output(), since the real length of the string is unknown and the entire buffer must be copied (and defining an unrolled strnlen() inside the bpf program is a very inefficient and unfeasible approach). With the new helper, the code can easily operate on the actual string length rather than the buffer size: SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { char buf[PATHLEN]; // PATHLEN is defined to 256 int res = bpf_probe_read_str(buf, sizeof(buf), ctx->di); /* consume buf, for example push it to userspace via * bpf_perf_event_output(), but this time we can use * res (the string length) as event size, after checking * its boundaries. */ } Another useful use case is when parsing individual process arguments or individual environment variables navigating current->mm->arg_start and current->mm->env_start: using this helper and the return value, one can quickly iterate at the right offset of the memory area. The code changes simply leverage the already existent strncpy_from_unsafe() kernel function, which is safe to be called from a bpf program as it is used in bpf_trace_printk(). Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 15 ++++++++++++++- kernel/trace/bpf_trace.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..54a5894bb4ea 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -430,6 +430,18 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data * Return: 0 on success or negative on error + * + * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * Copy a NUL terminated string from unsafe address. In case the string + * length is smaller than size, the target is not padded with further NUL + * bytes. In case the string length is larger than size, just count-1 + * bytes are copied and the last byte is set to NUL. + * @dst: destination address + * @size: maximum number of bytes to copy, including the trailing NUL + * @unsafe_ptr: unsafe address + * Return: + * > 0 length of the string including the trailing NUL on success + * < 0 error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -476,7 +488,8 @@ union bpf_attr { FN(set_hash_invalid), \ FN(get_numa_node_id), \ FN(skb_change_head), \ - FN(xdp_adjust_head), + FN(xdp_adjust_head), \ + FN(probe_read_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c22a961d1a42..424daa4586d1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_unsafe() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_unsafe(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_str_proto = { + .func = bpf_probe_read_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_str: + return &bpf_probe_read_str_proto; default: return NULL; } -- cgit v1.2.3 From b95a5c4db09bc7c253636cb84dc9b12c577fd5a0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sat, 21 Jan 2017 17:26:11 +0100 Subject: bpf: add a longest prefix match trie map implementation This trie implements a longest prefix match algorithm that can be used to match IP addresses to a stored set of ranges. Internally, data is stored in an unbalanced trie of nodes that has a maximum height of n, where n is the prefixlen the trie was created with. Tries may be created with prefix lengths that are multiples of 8, in the range from 8 to 2048. The key used for lookup and update operations is a struct bpf_lpm_trie_key, and the value is a uint64_t. The code carries more information about the internal implementation. Signed-off-by: Daniel Mack Reviewed-by: David Herrmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 + kernel/bpf/Makefile | 2 +- kernel/bpf/lpm_trie.c | 503 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/lpm_trie.c (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54a5894bb4ea..bd3068485410 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -63,6 +63,12 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -89,6 +95,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474ac3cd..e1ce4f4fd7fd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 000000000000..ba19241d1979 --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,503 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016,2017 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include +#include +#include +#include +#include +#include + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + raw_spinlock_t lock; +}; + +/* This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and one single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be places as root + * node, denoted as (R) in the example below. As there are no other node, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already + * a node with the same data and a smaller prefix (ie, a less specific one), + * node (2) will become a child of (1). In child index depends on the next bit + * that is outside of what (1) matches, and that bit is 0, so (2) will be + * child[0] of (1): + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | + * +----------------+ + * | (2) | + * | 192.168.0.0/24 | + * | value: 2 | + * | [0] [1] | + * +----------------+ + * + * The child[1] slot of (1) could be filled with another node which has bit #17 + * (the next bit after the ones that (1) matches on) set to 1. For instance, + * 192.168.128.0/24: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (2) | | (3) | + * | 192.168.0.0/24 | | 192.168.128.0/24 | + * | value: 2 | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * + * Let's add another node (4) to the game for 192.168.1.0/24. In order to place + * it, node (1) is looked at first, and because (4) of the semantics laid out + * above (bit #17 is 0), it would normally be attached to (1) as child[0]. + * However, that slot is already allocated, so a new node is needed in between. + * That node does not have a value attached to it and it will never be + * returned to users as result of a lookup. It is only there to differentiate + * the traversal further. It will get a prefix as wide as necessary to + * distinguish its two children: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (4) (I) | | (3) | + * | 192.168.0.0/23 | | 192.168.128.0/24 | + * | value: --- | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * | | + * +----------------+ +----------------+ + * | (2) | | (5) | + * | 192.168.0.0/24 | | 192.168.1.0/24 | + * | value: 2 | | value: 5 | + * | [0] [1] | | [0] [1] | + * +----------------+ +----------------+ + * + * 192.168.1.1/32 would be a child of (5) etc. + * + * An intermediate node will be turned into a 'real' node on demand. In the + * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. + * + * A fully populated trie would have a height of 32 nodes, as the trie was + * created with a prefix length of 32. + * + * The lookup starts at the root node. If the current node matches and if there + * is a child that can be used to become more specific, the trie is traversed + * downwards. The last node in the traversal that is a non-intermediate one is + * returned. + */ + +static inline int extract_bit(const u8 *data, size_t index) +{ + return !!(data[index / 8] & (1 << (7 - (index % 8)))); +} + +/** + * longest_prefix_match() - determine the longest prefix + * @trie: The trie to get internal sizes from + * @node: The node to operate on + * @key: The key to compare to @node + * + * Determine the longest prefix of @node that matches the bits in @key. + */ +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key *key) +{ + size_t prefixlen = 0; + size_t i; + + for (i = 0; i < trie->data_size; i++) { + size_t b; + + b = 8 - fls(node->data[i] ^ key->data[i]); + prefixlen += b; + + if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) + return min(node->prefixlen, key->prefixlen); + + if (b < 8) + break; + } + + return prefixlen; +} + +/* Called from syscall or from eBPF program */ +static void *trie_lookup_elem(struct bpf_map *map, void *_key) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *found = NULL; + struct bpf_lpm_trie_key *key = _key; + + /* Start walking the trie from the root node ... */ + + for (node = rcu_dereference(trie->root); node;) { + unsigned int next_bit; + size_t matchlen; + + /* Determine the longest prefix of @node that matches @key. + * If it's the maximum possible prefix for this trie, we have + * an exact match and can return it directly. + */ + matchlen = longest_prefix_match(trie, node, key); + if (matchlen == trie->max_prefixlen) { + found = node; + break; + } + + /* If the number of bits that match is smaller than the prefix + * length of @node, bail out and return the node we have seen + * last in the traversal (ie, the parent). + */ + if (matchlen < node->prefixlen) + break; + + /* Consider this node as return candidate unless it is an + * artificially added intermediate one. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + found = node; + + /* If the node match is fully satisfied, let's see if we can + * become more specific. Determine the next bit in the key and + * traverse down. + */ + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + + if (!found) + return NULL; + + return found->data + trie->data_size; +} + +static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, + const void *value) +{ + struct lpm_trie_node *node; + size_t size = sizeof(struct lpm_trie_node) + trie->data_size; + + if (value) + size += trie->map.value_size; + + node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + node->flags = 0; + + if (value) + memcpy(node->data + trie->data_size, value, + trie->map.value_size); + + return node; +} + +/* Called from syscall or from eBPF program */ +static int trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *im_node, *new_node = NULL; + struct lpm_trie_node __rcu **slot; + struct bpf_lpm_trie_key *key = _key; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Allocate and fill a new node */ + + if (trie->n_entries == trie->map.max_entries) { + ret = -ENOSPC; + goto out; + } + + new_node = lpm_trie_node_alloc(trie, value); + if (!new_node) { + ret = -ENOMEM; + goto out; + } + + trie->n_entries++; + + new_node->prefixlen = key->prefixlen; + RCU_INIT_POINTER(new_node->child[0], NULL); + RCU_INIT_POINTER(new_node->child[1], NULL); + memcpy(new_node->data, key->data, trie->data_size); + + /* Now find a slot to attach the new node. To do that, walk the tree + * from the root and match as many bits as possible for each node until + * we either find an empty slot or a slot that needs to be replaced by + * an intermediate node. + */ + slot = &trie->root; + + while ((node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen || + node->prefixlen == trie->max_prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + slot = &node->child[next_bit]; + } + + /* If the slot is empty (a free child pointer or an empty root), + * simply assign the @new_node to that slot and be done. + */ + if (!node) { + rcu_assign_pointer(*slot, new_node); + goto out; + } + + /* If the slot we picked already exists, replace it with @new_node + * which already has the correct data array set. + */ + if (node->prefixlen == matchlen) { + new_node->child[0] = node->child[0]; + new_node->child[1] = node->child[1]; + + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + trie->n_entries--; + + rcu_assign_pointer(*slot, new_node); + kfree_rcu(node, rcu); + + goto out; + } + + /* If the new node matches the prefix completely, it must be inserted + * as an ancestor. Simply insert it between @node and *@slot. + */ + if (matchlen == key->prefixlen) { + next_bit = extract_bit(node->data, matchlen); + rcu_assign_pointer(new_node->child[next_bit], node); + rcu_assign_pointer(*slot, new_node); + goto out; + } + + im_node = lpm_trie_node_alloc(trie, NULL); + if (!im_node) { + ret = -ENOMEM; + goto out; + } + + im_node->prefixlen = matchlen; + im_node->flags |= LPM_TREE_NODE_FLAG_IM; + memcpy(im_node->data, node->data, trie->data_size); + + /* Now determine which child to install in which slot */ + if (extract_bit(key->data, matchlen)) { + rcu_assign_pointer(im_node->child[0], node); + rcu_assign_pointer(im_node->child[1], new_node); + } else { + rcu_assign_pointer(im_node->child[0], new_node); + rcu_assign_pointer(im_node->child[1], node); + } + + /* Finally, assign the intermediate node to the determined spot */ + rcu_assign_pointer(*slot, im_node); + +out: + if (ret) { + if (new_node) + trie->n_entries--; + + kfree(new_node); + kfree(im_node); + } + + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; +} + +static int trie_delete_elem(struct bpf_map *map, void *key) +{ + /* TODO */ + return -ENOSYS; +} + +static struct bpf_map *trie_alloc(union bpf_attr *attr) +{ + size_t cost, cost_per_node; + struct lpm_trie *trie; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || + attr->map_flags != BPF_F_NO_PREALLOC || + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + if (!trie) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->data_size = attr->key_size - + offsetof(struct bpf_lpm_trie_key, data); + trie->max_prefixlen = trie->data_size * 8; + + cost_per_node = sizeof(struct lpm_trie_node) + + attr->value_size + trie->data_size; + cost = sizeof(*trie) + attr->max_entries * cost_per_node; + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(trie->map.pages); + if (ret) { + kfree(trie); + return ERR_PTR(ret); + } + + raw_spin_lock_init(&trie->lock); + + return &trie->map; +} + +static void trie_free(struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + + raw_spin_lock(&trie->lock); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent + * and start over. + */ + + for (;;) { + slot = &trie->root; + + for (;;) { + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) + goto unlock; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; + continue; + } + + if (rcu_access_pointer(node->child[1])) { + slot = &node->child[1]; + continue; + } + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); + break; + } + } + +unlock: + raw_spin_unlock(&trie->lock); +} + +static const struct bpf_map_ops trie_ops = { + .map_alloc = trie_alloc, + .map_free = trie_free, + .map_lookup_elem = trie_lookup_elem, + .map_update_elem = trie_update_elem, + .map_delete_elem = trie_delete_elem, +}; + +static struct bpf_map_type_list trie_type __read_mostly = { + .ops = &trie_ops, + .type = BPF_MAP_TYPE_LPM_TRIE, +}; + +static int __init register_trie_map(void) +{ + bpf_register_map_type(&trie_type); + return 0; +} +late_initcall(register_trie_map); -- cgit v1.2.3 From d140199af510ad4749dc5e38b7922135258ba5fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Jan 2017 01:26:46 +0100 Subject: bpf, lpm: fix kfree of im_node in trie_update_elem We need to initialize im_node to NULL, otherwise in case of error path it gets passed to kfree() as uninitialized pointer. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index ba19241d1979..144e9763102f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -262,7 +262,7 @@ static int trie_update_elem(struct bpf_map *map, void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct lpm_trie_node *node, *im_node, *new_node = NULL; + struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key *key = _key; unsigned long irq_flags; -- cgit v1.2.3 From 3fadc80115837b86f989d17c4aa92bb5cb7bc1b6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Jan 2017 01:06:30 +0100 Subject: bpf: enable verifier to better track const alu ops William reported couple of issues in relation to direct packet access. Typical scheme is to check for data + [off] <= data_end, where [off] can be either immediate or coming from a tracked register that contains an immediate, depending on the branch, we can then access the data. However, in case of calculating [off] for either the mentioned test itself or for access after the test in a more "complex" way, then the verifier will stop tracking the CONST_IMM marked register and will mark it as UNKNOWN_VALUE one. Adding that UNKNOWN_VALUE typed register to a pkt() marked register, the verifier then bails out in check_packet_ptr_add() as it finds the registers imm value below 48. In the first below example, that is due to evaluate_reg_imm_alu() not handling right shifts and thus marking the register as UNKNOWN_VALUE via helper __mark_reg_unknown_value() that resets imm to 0. In the second case the same happens at the time when r4 is set to r4 &= r5, where it transitions to UNKNOWN_VALUE from evaluate_reg_imm_alu(). Later on r4 we shift right by 3 inside evaluate_reg_alu(), where the register's imm turns into 3. That is, for registers with type UNKNOWN_VALUE, imm of 0 means that we don't know what value the register has, and for imm > 0 it means that the value has [imm] upper zero bits. F.e. when shifting an UNKNOWN_VALUE register by 3 to the right, no matter what value it had, we know that the 3 upper most bits must be zero now. This is to make sure that ALU operations with unknown registers don't overflow. Meaning, once we know that we have more than 48 upper zero bits, or, in other words cannot go beyond 0xffff offset with ALU ops, such an addition will track the target register as a new pkt() register with a new id, but 0 offset and 0 range, so for that a new data/data_end test will be required. Is the source register a CONST_IMM one that is to be added to the pkt() register, or the source instruction is an add instruction with immediate value, then it will get added if it stays within max 0xffff bounds. >From there, pkt() type, can be accessed should reg->off + imm be within the access range of pkt(). [...] from 28 to 30: R0=imm1,min_value=1,max_value=1 R1=pkt(id=0,off=0,r=22) R2=pkt_end R3=imm144,min_value=144,max_value=144 R4=imm0,min_value=0,max_value=0 R5=inv48,min_value=2054,max_value=2054 R10=fp 30: (bf) r5 = r3 31: (07) r5 += 23 32: (77) r5 >>= 3 33: (bf) r6 = r1 34: (0f) r6 += r5 cannot add integer value with 0 upper zero bits to ptr_to_packet [...] from 52 to 80: R0=imm1,min_value=1,max_value=1 R1=pkt(id=0,off=0,r=34) R2=pkt_end R3=inv R4=imm272 R5=inv56,min_value=17,max_value=17 R6=pkt(id=0,off=26,r=34) R10=fp 80: (07) r4 += 71 81: (18) r5 = 0xfffffff8 83: (5f) r4 &= r5 84: (77) r4 >>= 3 85: (0f) r1 += r4 cannot add integer value with 3 upper zero bits to ptr_to_packet Thus to get above use-cases working, evaluate_reg_imm_alu() has been extended for further ALU ops. This is fine, because we only operate strictly within realm of CONST_IMM types, so here we don't care about overflows as they will happen in the simulated but also real execution and interaction with pkt() in check_packet_ptr_add() will check actual imm value once added to pkt(), but it's irrelevant before. With regards to 06c1c049721a ("bpf: allow helpers access to variable memory") that works on UNKNOWN_VALUE registers, the verifier becomes now a bit smarter as it can better resolve ALU ops, so we need to adapt two test cases there, as min/max bound tracking only becomes necessary when registers were spilled to stack. So while mask was set before to track upper bound for UNKNOWN_VALUE case, it's now resolved directly as CONST_IMM, and such contructs are only necessary when f.e. registers are spilled. For commit 6b17387307ba ("bpf: recognize 64bit immediate loads as consts") that initially enabled dw load tracking only for nfp jit/ analyzer, I did couple of tests on large, complex programs and we don't increase complexity badly (my tests were in ~3% range on avg). I've added a couple of tests similar to affected code above, and it works fine with verifier now. Reported-by: William Tu Signed-off-by: Daniel Borkmann Cc: Gianluca Borello Cc: William Tu Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 64 +++++++++++++++------- tools/testing/selftests/bpf/test_verifier.c | 82 +++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8f69df7e8167..fb3513b35c0b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1566,22 +1566,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); + u64 dst_imm = dst_reg->imm; - /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' - * insn. Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here. Simulate execution of insns + * containing ALU ops. Don't care about overflow or negative + * values, just add/sub/... them; registers are in u64. */ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm += insn->imm; - else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm += src_reg->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm |= insn->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm |= src_reg->imm; - else + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { + dst_imm += insn->imm; + } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm += src_reg->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { + dst_imm -= insn->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm -= src_reg->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { + dst_imm *= insn->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm *= src_reg->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { + dst_imm |= insn->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm |= src_reg->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { + dst_imm &= insn->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm &= src_reg->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm >>= insn->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm >>= src_reg->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm <<= insn->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm <<= src_reg->imm; + } else { mark_reg_unknown_value(regs, insn->dst_reg); + goto out; + } + + dst_reg->imm = dst_imm; +out: return 0; } @@ -2225,14 +2257,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (insn->src_reg == 0) { - /* generic move 64-bit immediate into a register, - * only analyzer needs to collect the ld_imm value. - */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - if (!env->analyzer_ops) - return 0; - regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; return 0; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1aa73241c999..0d0912c7f03c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2325,6 +2325,84 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "direct packet access: test11 (shift, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8), + BPF_MOV64_IMM(BPF_REG_3, 144), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_5, 3), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test12 (and, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8), + BPF_MOV64_IMM(BPF_REG_3, 144), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23), + BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 15), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test13 (branches, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 13), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_MOV64_IMM(BPF_REG_4, 1), + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_4, 2), + BPF_MOV64_IMM(BPF_REG_3, 14), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, 24), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23), + BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 15), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, { "helper access to packet: test1, valid packet_ptr range", .insns = { @@ -4208,6 +4286,8 @@ static struct bpf_test tests[] = { .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), BPF_MOV64_IMM(BPF_REG_3, 0), BPF_MOV64_IMM(BPF_REG_4, 0), @@ -4251,6 +4331,8 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), -- cgit v1.2.3 From 2acae0d5b0f73a8fb4b180bd13491feb96e55fc6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 25 Jan 2017 02:28:16 +0100 Subject: trace: add variant without spacing in trace_print_hex_seq For upcoming tracepoint support for BPF, we want to dump the program's tag. Format should be similar to __print_hex(), but without spacing. Add a __print_hex_str() variant for exactly that purpose that reuses trace_print_hex_seq(). Signed-off-by: Daniel Borkmann Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/trace_events.h | 3 ++- include/trace/trace_events.h | 8 +++++++- kernel/trace/trace_output.c | 7 ++++--- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index be007610ceb0..cfa475a0e9ca 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -33,7 +33,8 @@ const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size); const char *trace_print_hex_seq(struct trace_seq *p, - const unsigned char *buf, int len); + const unsigned char *buf, int len, + bool spacing); const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 467e12f780d8..9f684629c3cf 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -297,7 +297,12 @@ TRACE_MAKE_SYSTEM_STR(); #endif #undef __print_hex -#define __print_hex(buf, buf_len) trace_print_hex_seq(p, buf, buf_len) +#define __print_hex(buf, buf_len) \ + trace_print_hex_seq(p, buf, buf_len, true) + +#undef __print_hex_str +#define __print_hex_str(buf, buf_len) \ + trace_print_hex_seq(p, buf, buf_len, false) #undef __print_array #define __print_array(array, count, el_size) \ @@ -711,6 +716,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_flags #undef __print_symbolic #undef __print_hex +#undef __print_hex_str #undef __get_dynamic_array #undef __get_dynamic_array_len #undef __get_str diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..30a144b1b9ee 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -163,14 +163,15 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); const char * -trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, + bool spacing) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); - + trace_seq_printf(p, "%s%2.2x", !spacing || i == 0 ? "" : " ", + buf[i]); trace_seq_putc(p, 0); return ret; -- cgit v1.2.3 From a67edbf4fb6deadcfe57a04a134abed4a5ba3bb5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 25 Jan 2017 02:28:18 +0100 Subject: bpf: add initial bpf tracepoints This work adds a number of tracepoints to paths that are either considered slow-path or exception-like states, where monitoring or inspecting them would be desirable. For bpf(2) syscall, tracepoints have been placed for main commands when they succeed. In XDP case, tracepoint is for exceptions, that is, f.e. on abnormal BPF program exit such as unknown or XDP_ABORTED return code, or when error occurs during XDP_TX action and the packet could not be forwarded. Both have been split into separate event headers, and can be further extended. Worst case, if they unexpectedly should get into our way in future, they can also removed [1]. Of course, these tracepoints (like any other) can be analyzed by eBPF itself, etc. Example output: # ./perf record -a -e bpf:* sleep 10 # ./perf script sock_example 6197 [005] 283.980322: bpf:bpf_map_create: map type=ARRAY ufd=4 key=4 val=8 max=256 flags=0 sock_example 6197 [005] 283.980721: bpf:bpf_prog_load: prog=a5ea8fa30ea6849c type=SOCKET_FILTER ufd=5 sock_example 6197 [005] 283.988423: bpf:bpf_prog_get_type: prog=a5ea8fa30ea6849c type=SOCKET_FILTER sock_example 6197 [005] 283.988443: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[06 00 00 00] val=[00 00 00 00 00 00 00 00] [...] sock_example 6197 [005] 288.990868: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[01 00 00 00] val=[14 00 00 00 00 00 00 00] swapper 0 [005] 289.338243: bpf:bpf_prog_put_rcu: prog=a5ea8fa30ea6849c type=SOCKET_FILTER [1] https://lwn.net/Articles/705270/ Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 12 +- .../net/ethernet/netronome/nfp/nfp_net_common.c | 15 +- drivers/net/ethernet/qlogic/qede/qede_fp.c | 4 + drivers/net/virtio_net.c | 12 +- include/linux/bpf_trace.h | 7 + include/trace/events/bpf.h | 347 +++++++++++++++++++++ include/trace/events/xdp.h | 53 ++++ kernel/bpf/core.c | 9 + kernel/bpf/inode.c | 17 +- kernel/bpf/syscall.c | 19 +- 11 files changed, 483 insertions(+), 15 deletions(-) create mode 100644 include/linux/bpf_trace.h create mode 100644 include/trace/events/bpf.h create mode 100644 include/trace/events/xdp.h (limited to 'kernel') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index e362f99334d0..f15ddba3659a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -926,10 +927,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud length, cq->ring, &doorbell_pending))) goto consumed; + trace_xdp_exception(dev, xdp_prog, act); goto xdp_drop_no_cnt; /* Drop on xmit failure */ default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); case XDP_DROP: ring->xdp_drop++; xdp_drop_no_cnt: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index ba50583ea3ed..3d2e1a1886a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "en.h" #include "en_tc.h" @@ -640,7 +641,7 @@ static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_sq *sq) mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0); } -static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, +static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, const struct xdp_buff *xdp) { @@ -662,7 +663,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, MLX5E_SW2HW_MTU(rq->netdev->mtu) < dma_len)) { rq->stats.xdp_drop++; mlx5e_page_release(rq, di, true); - return; + return false; } if (unlikely(!mlx5e_sq_has_room_for(sq, MLX5E_XDP_TX_WQEBBS))) { @@ -673,7 +674,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, } rq->stats.xdp_tx_full++; mlx5e_page_release(rq, di, true); - return; + return false; } dma_len -= MLX5E_XDP_MIN_INLINE; @@ -703,6 +704,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, sq->db.xdp.doorbell = true; rq->stats.xdp_tx++; + return true; } /* returns true if packet was consumed by xdp */ @@ -728,11 +730,13 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, *len = xdp.data_end - xdp.data; return false; case XDP_TX: - mlx5e_xmit_xdp_frame(rq, di, &xdp); + if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp))) + trace_xdp_exception(rq->netdev, prog, act); return true; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(rq->netdev, prog, act); case XDP_DROP: rq->stats.xdp_drop++; mlx5e_page_release(rq, di, true); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 67afd95ffb93..6ac43abf561b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -42,6 +42,7 @@ */ #include +#include #include #include #include @@ -1459,7 +1460,7 @@ nfp_net_rx_drop(struct nfp_net_r_vector *r_vec, struct nfp_net_rx_ring *rx_ring, dev_kfree_skb_any(skb); } -static void +static bool nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring, struct nfp_net_tx_ring *tx_ring, struct nfp_net_rx_buf *rxbuf, unsigned int pkt_off, @@ -1473,13 +1474,13 @@ nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring, if (unlikely(nfp_net_tx_full(tx_ring, 1))) { nfp_net_rx_drop(rx_ring->r_vec, rx_ring, rxbuf, NULL); - return; + return false; } new_frag = nfp_net_napi_alloc_one(nn, DMA_BIDIRECTIONAL, &new_dma_addr); if (unlikely(!new_frag)) { nfp_net_rx_drop(rx_ring->r_vec, rx_ring, rxbuf, NULL); - return; + return false; } nfp_net_rx_give_one(rx_ring, new_frag, new_dma_addr); @@ -1509,6 +1510,7 @@ nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring, tx_ring->wr_p++; tx_ring->wr_ptr_add++; + return true; } static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len) @@ -1613,12 +1615,15 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) case XDP_PASS: break; case XDP_TX: - nfp_net_tx_xdp_buf(nn, rx_ring, tx_ring, rxbuf, - pkt_off, pkt_len); + if (unlikely(!nfp_net_tx_xdp_buf(nn, rx_ring, + tx_ring, rxbuf, + pkt_off, pkt_len))) + trace_xdp_exception(nn->netdev, xdp_prog, act); continue; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(nn->netdev, xdp_prog, act); case XDP_DROP: nfp_net_rx_give_one(rx_ring, rxbuf->frag, rxbuf->dma_addr); diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 1a6ca4884fad..445d4d2492c3 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -1016,6 +1017,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, /* We need the replacement buffer before transmit. */ if (qede_alloc_rx_buffer(rxq, true)) { qede_recycle_rx_bd_ring(rxq, 1); + trace_xdp_exception(edev->ndev, prog, act); return false; } @@ -1026,6 +1028,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, dma_unmap_page(rxq->dev, bd->mapping, PAGE_SIZE, DMA_BIDIRECTIONAL); __free_page(bd->data); + trace_xdp_exception(edev->ndev, prog, act); } /* Regardless, we've consumed an Rx BD */ @@ -1035,6 +1038,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(edev->ndev, prog, act); case XDP_DROP: qede_recycle_rx_bd_ring(rxq, cqe->bd_num); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 37db91d1a0a3..f9bf94887ff1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -330,7 +331,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, return skb; } -static void virtnet_xdp_xmit(struct virtnet_info *vi, +static bool virtnet_xdp_xmit(struct virtnet_info *vi, struct receive_queue *rq, struct send_queue *sq, struct xdp_buff *xdp, @@ -382,10 +383,12 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, put_page(page); } else /* small buffer */ kfree_skb(data); - return; // On error abort to avoid unnecessary kick + /* On error abort to avoid unnecessary kick */ + return false; } virtqueue_kick(sq->vq); + return true; } static u32 do_xdp_prog(struct virtnet_info *vi, @@ -421,11 +424,14 @@ static u32 do_xdp_prog(struct virtnet_info *vi, vi->xdp_queue_pairs + smp_processor_id(); xdp.data = buf; - virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data); + if (unlikely(!virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, + data))) + trace_xdp_exception(vi->dev, xdp_prog, act); return XDP_TX; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(vi->dev, xdp_prog, act); case XDP_DROP: return XDP_DROP; } diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h new file mode 100644 index 000000000000..b22efbdd2eb4 --- /dev/null +++ b/include/linux/bpf_trace.h @@ -0,0 +1,7 @@ +#ifndef __LINUX_BPF_TRACE_H__ +#define __LINUX_BPF_TRACE_H__ + +#include +#include + +#endif /* __LINUX_BPF_TRACE_H__ */ diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h new file mode 100644 index 000000000000..c3a53fd47ff1 --- /dev/null +++ b/include/trace/events/bpf.h @@ -0,0 +1,347 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf + +#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BPF_H + +#include +#include +#include +#include + +#define __PROG_TYPE_MAP(FN) \ + FN(SOCKET_FILTER) \ + FN(KPROBE) \ + FN(SCHED_CLS) \ + FN(SCHED_ACT) \ + FN(TRACEPOINT) \ + FN(XDP) \ + FN(PERF_EVENT) \ + FN(CGROUP_SKB) \ + FN(CGROUP_SOCK) \ + FN(LWT_IN) \ + FN(LWT_OUT) \ + FN(LWT_XMIT) + +#define __MAP_TYPE_MAP(FN) \ + FN(HASH) \ + FN(ARRAY) \ + FN(PROG_ARRAY) \ + FN(PERF_EVENT_ARRAY) \ + FN(PERCPU_HASH) \ + FN(PERCPU_ARRAY) \ + FN(STACK_TRACE) \ + FN(CGROUP_ARRAY) \ + FN(LRU_HASH) \ + FN(LRU_PERCPU_HASH) \ + FN(LPM_TRIE) + +#define __PROG_TYPE_TP_FN(x) \ + TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x); +#define __PROG_TYPE_SYM_FN(x) \ + { BPF_PROG_TYPE_##x, #x }, +#define __PROG_TYPE_SYM_TAB \ + __PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 } +__PROG_TYPE_MAP(__PROG_TYPE_TP_FN) + +#define __MAP_TYPE_TP_FN(x) \ + TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x); +#define __MAP_TYPE_SYM_FN(x) \ + { BPF_MAP_TYPE_##x, #x }, +#define __MAP_TYPE_SYM_TAB \ + __MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 } +__MAP_TYPE_MAP(__MAP_TYPE_TP_FN) + +DECLARE_EVENT_CLASS(bpf_prog_event, + + TP_PROTO(const struct bpf_prog *prg), + + TP_ARGS(prg), + + TP_STRUCT__entry( + __array(u8, prog_tag, 8) + __field(u32, type) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); + memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); + __entry->type = prg->type; + ), + + TP_printk("prog=%s type=%s", + __print_hex_str(__entry->prog_tag, 8), + __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB)) +); + +DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type, + + TP_PROTO(const struct bpf_prog *prg), + + TP_ARGS(prg) +); + +DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu, + + TP_PROTO(const struct bpf_prog *prg), + + TP_ARGS(prg) +); + +TRACE_EVENT(bpf_prog_load, + + TP_PROTO(const struct bpf_prog *prg, int ufd), + + TP_ARGS(prg, ufd), + + TP_STRUCT__entry( + __array(u8, prog_tag, 8) + __field(u32, type) + __field(int, ufd) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); + memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); + __entry->type = prg->type; + __entry->ufd = ufd; + ), + + TP_printk("prog=%s type=%s ufd=%d", + __print_hex_str(__entry->prog_tag, 8), + __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB), + __entry->ufd) +); + +TRACE_EVENT(bpf_map_create, + + TP_PROTO(const struct bpf_map *map, int ufd), + + TP_ARGS(map, ufd), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, size_key) + __field(u32, size_value) + __field(u32, max_entries) + __field(u32, flags) + __field(int, ufd) + ), + + TP_fast_assign( + __entry->type = map->map_type; + __entry->size_key = map->key_size; + __entry->size_value = map->value_size; + __entry->max_entries = map->max_entries; + __entry->flags = map->map_flags; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, __entry->size_key, __entry->size_value, + __entry->max_entries, __entry->flags) +); + +DECLARE_EVENT_CLASS(bpf_obj_prog, + + TP_PROTO(const struct bpf_prog *prg, int ufd, + const struct filename *pname), + + TP_ARGS(prg, ufd, pname), + + TP_STRUCT__entry( + __array(u8, prog_tag, 8) + __field(int, ufd) + __string(path, pname->name) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); + memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); + __assign_str(path, pname->name); + __entry->ufd = ufd; + ), + + TP_printk("prog=%s path=%s ufd=%d", + __print_hex_str(__entry->prog_tag, 8), + __get_str(path), __entry->ufd) +); + +DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog, + + TP_PROTO(const struct bpf_prog *prg, int ufd, + const struct filename *pname), + + TP_ARGS(prg, ufd, pname) +); + +DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog, + + TP_PROTO(const struct bpf_prog *prg, int ufd, + const struct filename *pname), + + TP_ARGS(prg, ufd, pname) +); + +DECLARE_EVENT_CLASS(bpf_obj_map, + + TP_PROTO(const struct bpf_map *map, int ufd, + const struct filename *pname), + + TP_ARGS(map, ufd, pname), + + TP_STRUCT__entry( + __field(u32, type) + __field(int, ufd) + __string(path, pname->name) + ), + + TP_fast_assign( + __assign_str(path, pname->name); + __entry->type = map->map_type; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d path=%s", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, __get_str(path)) +); + +DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map, + + TP_PROTO(const struct bpf_map *map, int ufd, + const struct filename *pname), + + TP_ARGS(map, ufd, pname) +); + +DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map, + + TP_PROTO(const struct bpf_map *map, int ufd, + const struct filename *pname), + + TP_ARGS(map, ufd, pname) +); + +DECLARE_EVENT_CLASS(bpf_map_keyval, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *val), + + TP_ARGS(map, ufd, key, val), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, key_len) + __dynamic_array(u8, key, map->key_size) + __field(bool, key_trunc) + __field(u32, val_len) + __dynamic_array(u8, val, map->value_size) + __field(bool, val_trunc) + __field(int, ufd) + ), + + TP_fast_assign( + memcpy(__get_dynamic_array(key), key, map->key_size); + memcpy(__get_dynamic_array(val), val, map->value_size); + __entry->type = map->map_type; + __entry->key_len = min(map->key_size, 16U); + __entry->key_trunc = map->key_size != __entry->key_len; + __entry->val_len = min(map->value_size, 16U); + __entry->val_trunc = map->value_size != __entry->val_len; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, + __print_hex(__get_dynamic_array(key), __entry->key_len), + __entry->key_trunc ? " ..." : "", + __print_hex(__get_dynamic_array(val), __entry->val_len), + __entry->val_trunc ? " ..." : "") +); + +DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *val), + + TP_ARGS(map, ufd, key, val) +); + +DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *val), + + TP_ARGS(map, ufd, key, val) +); + +TRACE_EVENT(bpf_map_delete_elem, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key), + + TP_ARGS(map, ufd, key), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, key_len) + __dynamic_array(u8, key, map->key_size) + __field(bool, key_trunc) + __field(int, ufd) + ), + + TP_fast_assign( + memcpy(__get_dynamic_array(key), key, map->key_size); + __entry->type = map->map_type; + __entry->key_len = min(map->key_size, 16U); + __entry->key_trunc = map->key_size != __entry->key_len; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=[%s%s]", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, + __print_hex(__get_dynamic_array(key), __entry->key_len), + __entry->key_trunc ? " ..." : "") +); + +TRACE_EVENT(bpf_map_next_key, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *key_next), + + TP_ARGS(map, ufd, key, key_next), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, key_len) + __dynamic_array(u8, key, map->key_size) + __dynamic_array(u8, nxt, map->key_size) + __field(bool, key_trunc) + __field(int, ufd) + ), + + TP_fast_assign( + memcpy(__get_dynamic_array(key), key, map->key_size); + memcpy(__get_dynamic_array(nxt), key_next, map->key_size); + __entry->type = map->map_type; + __entry->key_len = min(map->key_size, 16U); + __entry->key_trunc = map->key_size != __entry->key_len; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, + __print_hex(__get_dynamic_array(key), __entry->key_len), + __entry->key_trunc ? " ..." : "", + __print_hex(__get_dynamic_array(nxt), __entry->key_len), + __entry->key_trunc ? " ..." : "") +); + +#endif /* _TRACE_BPF_H */ + +#include diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h new file mode 100644 index 000000000000..1b61357d3f57 --- /dev/null +++ b/include/trace/events/xdp.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xdp + +#if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XDP_H + +#include +#include +#include + +#define __XDP_ACT_MAP(FN) \ + FN(ABORTED) \ + FN(DROP) \ + FN(PASS) \ + FN(TX) + +#define __XDP_ACT_TP_FN(x) \ + TRACE_DEFINE_ENUM(XDP_##x); +#define __XDP_ACT_SYM_FN(x) \ + { XDP_##x, #x }, +#define __XDP_ACT_SYM_TAB \ + __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, 0 } +__XDP_ACT_MAP(__XDP_ACT_TP_FN) + +TRACE_EVENT(xdp_exception, + + TP_PROTO(const struct net_device *dev, + const struct bpf_prog *xdp, u32 act), + + TP_ARGS(dev, xdp, act), + + TP_STRUCT__entry( + __string(name, dev->name) + __array(u8, prog_tag, 8) + __field(u32, act) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); + memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); + __assign_str(name, dev->name); + __entry->act = act; + ), + + TP_printk("prog=%s device=%s action=%s", + __print_hex_str(__entry->prog_tag, 8), + __get_str(name), + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB)) +); + +#endif /* _TRACE_XDP_H */ + +#include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 503d4211988a..fddd76b1b627 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1173,3 +1173,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, { return -EFAULT; } + +/* All definitions of tracepoints related to BPF. */ +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0b030c9126d3..fddcae801724 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,7 @@ #include #include #include +#include enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); + if ((trace_bpf_obj_pin_prog_enabled() || + trace_bpf_obj_pin_map_enabled()) && !ret) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_pin_prog(raw, ufd, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_pin_map(raw, ufd, pname); + } out: putname(pname); return ret; @@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname) else goto out; - if (ret < 0) + if (ret < 0) { bpf_any_put(raw, type); + } else if (trace_bpf_obj_get_prog_enabled() || + trace_bpf_obj_get_map_enabled()) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_get_prog(raw, ret, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_get_map(raw, ret, pname); + } out: putname(pname); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d6b29e4e2c3..05ad086ab71d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -10,6 +10,7 @@ * General Public License for more details. */ #include +#include #include #include #include @@ -215,6 +216,7 @@ static int map_create(union bpf_attr *attr) /* failed to allocate fd */ goto free_map; + trace_bpf_map_create(map, err); return err; free_map: @@ -339,6 +341,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; + trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -421,6 +424,8 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -466,6 +471,8 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_delete_elem(map, ufd, key); free_key: kfree(key); err_put: @@ -518,6 +525,7 @@ static int map_get_next_key(union bpf_attr *attr) if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; + trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -671,8 +679,10 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) + if (atomic_dec_and_test(&prog->aux->refcnt)) { + trace_bpf_prog_put_rcu(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -781,7 +791,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - return __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get_type); @@ -863,6 +877,7 @@ static int bpf_prog_load(union bpf_attr *attr) /* failed to allocate fd */ goto free_used_maps; + trace_bpf_prog_load(prog, err); return err; free_used_maps: -- cgit v1.2.3 From 3898fac1f488c76e0eef5b5267b4ba8112a82ac4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 2 Feb 2017 17:09:54 +0100 Subject: trace: rename trace_print_hex_seq arg and add kdoc Steven suggested to improve trace_print_hex_seq() a bit after commit 2acae0d5b0f7 ("trace: add variant without spacing in trace_print_hex_seq") in two ways: i) by adding a kdoc comment for the helper function itself and ii) by renaming 'spacing' argument into 'concatenate' to better denote that we don't add spaces between each hex bytes. Suggested-by: Steven Rostedt Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/trace_events.h | 2 +- include/trace/trace_events.h | 4 ++-- kernel/trace/trace_output.c | 15 +++++++++++++-- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index cfa475a0e9ca..0f165507495c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -34,7 +34,7 @@ const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, const char *trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len, - bool spacing); + bool concatenate); const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 9f684629c3cf..5c06f4af8323 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -298,11 +298,11 @@ TRACE_MAKE_SYSTEM_STR(); #undef __print_hex #define __print_hex(buf, buf_len) \ - trace_print_hex_seq(p, buf, buf_len, true) + trace_print_hex_seq(p, buf, buf_len, false) #undef __print_hex_str #define __print_hex_str(buf, buf_len) \ - trace_print_hex_seq(p, buf, buf_len, false) + trace_print_hex_seq(p, buf, buf_len, true) #undef __print_array #define __print_array(array, count, el_size) \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 30a144b1b9ee..aea6a1218c7d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -162,15 +162,26 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, } EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); +/** + * trace_print_hex_seq - print buffer as hex sequence + * @p: trace seq struct to write to + * @buf: The buffer to print + * @buf_len: Length of @buf in bytes + * @concatenate: Print @buf as single hex string or with spacing + * + * Prints the passed buffer as a hex sequence either as a whole, + * single hex string if @concatenate is true or with spacing after + * each byte in case @concatenate is false. + */ const char * trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, - bool spacing) + bool concatenate) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", !spacing || i == 0 ? "" : " ", + trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", buf[i]); trace_seq_putc(p, 0); -- cgit v1.2.3 From 63dfef75ed75364901d7caa52c6420cec3e73519 Mon Sep 17 00:00:00 2001 From: William Tu Date: Sat, 4 Feb 2017 08:37:29 -0800 Subject: bpf: enable verifier to add 0 to packet ptr The patch fixes the case when adding a zero value to the packet pointer. The zero value could come from src_reg equals type BPF_K or CONST_IMM. The patch fixes both, otherwise the verifer reports the following error: [...] R0=imm0,min_value=0,max_value=0 R1=pkt(id=0,off=0,r=4) R2=pkt_end R3=fp-12 R4=imm4,min_value=4,max_value=4 R5=pkt(id=0,off=4,r=4) 269: (bf) r2 = r0 // r2 becomes imm0 270: (77) r2 >>= 3 271: (bf) r4 = r1 // r4 becomes pkt ptr 272: (0f) r4 += r2 // r4 += 0 addition of negative constant to packet pointer is not allowed Signed-off-by: William Tu Signed-off-by: Mihai Budiu Cc: Daniel Borkmann Cc: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- tools/testing/selftests/bpf/test_verifier.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb3513b35c0b..1a754e5d2695 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1397,7 +1397,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env, imm = insn->imm; add_imm: - if (imm <= 0) { + if (imm < 0) { verbose("addition of negative constant to packet pointer is not allowed\n"); return -EACCES; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index df194e1d56c2..71f6407cde60 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2403,6 +2403,29 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "direct packet access: test14 (pkt_ptr += 0, CONST_IMM, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 7), + BPF_MOV64_IMM(BPF_REG_5, 12), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_5, 4), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, { "helper access to packet: test1, valid packet_ptr range", .insns = { -- cgit v1.2.3 From c502faf94153bd0fedc5389a936f728a659cc6ab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 8 Feb 2017 01:19:43 +0100 Subject: bpf, lpm: fix overflows in trie_alloc checks Cap the maximum (total) value size and bail out if larger than KMALLOC_MAX_SIZE as otherwise it doesn't make any sense to proceed further, since we're guaranteed to fail to allocate elements anyway in lpm_trie_node_alloc(); likleyhood of failure is still high for large values, though, similarly as with htab case in non-prealloc. Next, make sure that cost vars are really u64 instead of size_t, so that we don't overflow on 32 bit and charge only tiny map.pages against memlock while allowing huge max_entries; cap also the max cost like we do with other map types. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 144e9763102f..e0f6a0bd279b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -394,10 +394,21 @@ static int trie_delete_elem(struct bpf_map *map, void *key) return -ENOSYS; } +#define LPM_DATA_SIZE_MAX 256 +#define LPM_DATA_SIZE_MIN 1 + +#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ + sizeof(struct lpm_trie_node)) +#define LPM_VAL_SIZE_MIN 1 + +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) +#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) + static struct bpf_map *trie_alloc(union bpf_attr *attr) { - size_t cost, cost_per_node; struct lpm_trie *trie; + u64 cost = sizeof(*trie), cost_per_node; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -406,9 +417,10 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->map_flags != BPF_F_NO_PREALLOC || - attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || - attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || - attr->value_size == 0) + attr->key_size < LPM_KEY_SIZE_MIN || + attr->key_size > LPM_KEY_SIZE_MAX || + attr->value_size < LPM_VAL_SIZE_MIN || + attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); @@ -426,18 +438,24 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; - cost = sizeof(*trie) + attr->max_entries * cost_per_node; + cost += (u64) attr->max_entries * cost_per_node; + if (cost >= U32_MAX - PAGE_SIZE) { + ret = -E2BIG; + goto out_err; + } + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_precharge_memlock(trie->map.pages); - if (ret) { - kfree(trie); - return ERR_PTR(ret); - } + if (ret) + goto out_err; raw_spin_lock_init(&trie->lock); return &trie->map; +out_err: + kfree(trie); + return ERR_PTR(ret); } static void trie_free(struct bpf_map *map) -- cgit v1.2.3 From 7e57fbb2a341b5d44d30e71a6d782c5e6dbc429c Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Tue, 14 Feb 2017 00:02:35 +0100 Subject: bpf: reduce compiler warnings by adding fallthrough comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following warnings: kernel/bpf/verifier.c: In function ‘may_access_direct_pkt_data’: kernel/bpf/verifier.c:702:6: warning: this statement may fall through [-Wimplicit-fallthrough=] if (t == BPF_WRITE) ^ kernel/bpf/verifier.c:704:2: note: here case BPF_PROG_TYPE_SCHED_CLS: ^~~~ kernel/bpf/verifier.c: In function ‘reg_set_min_max_inv’: kernel/bpf/verifier.c:2057:23: warning: this statement may fall through [-Wimplicit-fallthrough=] true_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2058:2: note: here case BPF_JSGT: ^~~~ kernel/bpf/verifier.c:2068:23: warning: this statement may fall through [-Wimplicit-fallthrough=] true_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2069:2: note: here case BPF_JSGE: ^~~~ kernel/bpf/verifier.c: In function ‘reg_set_min_max’: kernel/bpf/verifier.c:2009:24: warning: this statement may fall through [-Wimplicit-fallthrough=] false_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2010:2: note: here case BPF_JSGT: ^~~~ kernel/bpf/verifier.c:2019:24: warning: this statement may fall through [-Wimplicit-fallthrough=] false_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2020:2: note: here case BPF_JSGE: ^~~~ Reported-by: David Binderman Signed-off-by: Alexander Alemayhu Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1a754e5d2695..d2bded2b250c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -701,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; + /* fallthrough */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: @@ -2007,6 +2008,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. @@ -2017,6 +2019,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. @@ -2055,6 +2058,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* * If this is false, then the val is <= the register, if it is @@ -2066,6 +2070,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then constant < register, if it is true then * the register < constant. -- cgit v1.2.3 From c78f8bdfa11fcceb9723c61212e4bd8f76c87f9e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:48 +0100 Subject: bpf: mark all registered map/prog types as __ro_after_init All map types and prog types are registered to the BPF core through bpf_register_map_type() and bpf_register_prog_type() during init and remain unchanged thereafter. As by design we don't (and never will) have any pluggable code that can register to that at any later point in time, lets mark all the existing bpf_{map,prog}_type_list objects in the tree as __ro_after_init, so they can be moved to read-only section from then onwards. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 10 +++++----- kernel/bpf/hashtab.c | 8 ++++---- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/trace/bpf_trace.c | 6 +++--- net/core/filter.c | 18 +++++++++--------- 6 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d55d95dcf49..6b6f41f0b211 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -269,7 +269,7 @@ static const struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list array_type __read_mostly = { +static struct bpf_map_type_list array_type __ro_after_init = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; @@ -283,7 +283,7 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __read_mostly = { +static struct bpf_map_type_list percpu_array_type __ro_after_init = { .ops = &percpu_array_ops, .type = BPF_MAP_TYPE_PERCPU_ARRAY, }; @@ -409,7 +409,7 @@ static const struct bpf_map_ops prog_array_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, }; -static struct bpf_map_type_list prog_array_type __read_mostly = { +static struct bpf_map_type_list prog_array_type __ro_after_init = { .ops = &prog_array_ops, .type = BPF_MAP_TYPE_PROG_ARRAY, }; @@ -522,7 +522,7 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __read_mostly = { +static struct bpf_map_type_list perf_event_array_type __ro_after_init = { .ops = &perf_event_array_ops, .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; @@ -564,7 +564,7 @@ static const struct bpf_map_ops cgroup_array_ops = { .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; -static struct bpf_map_type_list cgroup_array_type __read_mostly = { +static struct bpf_map_type_list cgroup_array_type __ro_after_init = { .ops = &cgroup_array_ops, .type = BPF_MAP_TYPE_CGROUP_ARRAY, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a753bbe7df0a..3ea87fb19a94 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1023,7 +1023,7 @@ static const struct bpf_map_ops htab_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_type __read_mostly = { +static struct bpf_map_type_list htab_type __ro_after_init = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; @@ -1037,7 +1037,7 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __read_mostly = { +static struct bpf_map_type_list htab_lru_type __ro_after_init = { .ops = &htab_lru_ops, .type = BPF_MAP_TYPE_LRU_HASH, }; @@ -1124,7 +1124,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_percpu_type __ro_after_init = { .ops = &htab_percpu_ops, .type = BPF_MAP_TYPE_PERCPU_HASH, }; @@ -1138,7 +1138,7 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .ops = &htab_lru_percpu_ops, .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e0f6a0bd279b..8bfe0afaee10 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -508,7 +508,7 @@ static const struct bpf_map_ops trie_ops = { .map_delete_elem = trie_delete_elem, }; -static struct bpf_map_type_list trie_type __read_mostly = { +static struct bpf_map_type_list trie_type __ro_after_init = { .ops = &trie_ops, .type = BPF_MAP_TYPE_LPM_TRIE, }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be8519148c25..22aa45cd0324 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -273,7 +273,7 @@ static const struct bpf_map_ops stack_map_ops = { .map_delete_elem = stack_map_delete_elem, }; -static struct bpf_map_type_list stack_map_type __read_mostly = { +static struct bpf_map_type_list stack_map_type __ro_after_init = { .ops = &stack_map_ops, .type = BPF_MAP_TYPE_STACK_TRACE, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 424daa4586d1..cee9802cf3e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -506,7 +506,7 @@ static const struct bpf_verifier_ops kprobe_prog_ops = { .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl = { +static struct bpf_prog_type_list kprobe_tl __ro_after_init = { .ops = &kprobe_prog_ops, .type = BPF_PROG_TYPE_KPROBE, }; @@ -589,7 +589,7 @@ static const struct bpf_verifier_ops tracepoint_prog_ops = { .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl = { +static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { .ops = &tracepoint_prog_ops, .type = BPF_PROG_TYPE_TRACEPOINT, }; @@ -648,7 +648,7 @@ static const struct bpf_verifier_ops perf_event_prog_ops = { .convert_ctx_access = pe_prog_convert_ctx_access, }; -static struct bpf_prog_type_list perf_event_tl = { +static struct bpf_prog_type_list perf_event_tl __ro_after_init = { .ops = &perf_event_prog_ops, .type = BPF_PROG_TYPE_PERF_EVENT, }; diff --git a/net/core/filter.c b/net/core/filter.c index 0b753cbb2536..e466e0040137 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3296,47 +3296,47 @@ static const struct bpf_verifier_ops cg_sock_ops = { .convert_ctx_access = sock_filter_convert_ctx_access, }; -static struct bpf_prog_type_list sk_filter_type __read_mostly = { +static struct bpf_prog_type_list sk_filter_type __ro_after_init = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; -static struct bpf_prog_type_list sched_cls_type __read_mostly = { +static struct bpf_prog_type_list sched_cls_type __ro_after_init = { .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_CLS, }; -static struct bpf_prog_type_list sched_act_type __read_mostly = { +static struct bpf_prog_type_list sched_act_type __ro_after_init = { .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_ACT, }; -static struct bpf_prog_type_list xdp_type __read_mostly = { +static struct bpf_prog_type_list xdp_type __ro_after_init = { .ops = &xdp_ops, .type = BPF_PROG_TYPE_XDP, }; -static struct bpf_prog_type_list cg_skb_type __read_mostly = { +static struct bpf_prog_type_list cg_skb_type __ro_after_init = { .ops = &cg_skb_ops, .type = BPF_PROG_TYPE_CGROUP_SKB, }; -static struct bpf_prog_type_list lwt_in_type __read_mostly = { +static struct bpf_prog_type_list lwt_in_type __ro_after_init = { .ops = &lwt_inout_ops, .type = BPF_PROG_TYPE_LWT_IN, }; -static struct bpf_prog_type_list lwt_out_type __read_mostly = { +static struct bpf_prog_type_list lwt_out_type __ro_after_init = { .ops = &lwt_inout_ops, .type = BPF_PROG_TYPE_LWT_OUT, }; -static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { +static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = { .ops = &lwt_xmit_ops, .type = BPF_PROG_TYPE_LWT_XMIT, }; -static struct bpf_prog_type_list cg_sock_type __read_mostly = { +static struct bpf_prog_type_list cg_sock_type __ro_after_init = { .ops = &cg_sock_ops, .type = BPF_PROG_TYPE_CGROUP_SOCK }; -- cgit v1.2.3 From 9383191da4e40360a5d880fbe6bb03911c61621b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:49 +0100 Subject: bpf: remove stubs for cBPF from arch code Remove the dummy bpf_jit_compile() stubs for eBPF JITs and make that a single __weak function in the core that can be overridden similarly to the eBPF one. Also remove stale pr_err() mentions of bpf_jit_compile. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm64/net/bpf_jit_comp.c | 5 ----- arch/powerpc/net/bpf_jit_comp64.c | 2 -- arch/s390/net/bpf_jit_comp.c | 8 -------- arch/x86/net/bpf_jit_comp.c | 8 ++------ include/linux/filter.h | 6 +----- kernel/bpf/core.c | 12 +++++++++++- 6 files changed, 14 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index b2fc97a2c56c..c444408d5a8c 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -813,11 +813,6 @@ static inline void bpf_flush_icache(void *start, void *end) flush_icache_range((unsigned long)start, (unsigned long)end); } -void bpf_jit_compile(struct bpf_prog *prog) -{ - /* Nothing to do here. We support Internal BPF. */ -} - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_prog *tmp, *orig_prog = prog; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 73a5cf18fd84..f9ebd02260da 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -961,8 +961,6 @@ common_load: return 0; } -void bpf_jit_compile(struct bpf_prog *fp) { } - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { u32 proglen; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 167b31b186c1..6454efd22e63 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1262,14 +1262,6 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) return 0; } -/* - * Classic BPF function stub. BPF programs will be converted into - * eBPF and then bpf_int_jit_compile() will be called. - */ -void bpf_jit_compile(struct bpf_prog *fp) -{ -} - /* * Compile eBPF program "fp" */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bb660e53cbd6..26123d0ae13a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1067,13 +1067,13 @@ common_load: ilen = prog - temp; if (ilen > BPF_MAX_INSN_SIZE) { - pr_err("bpf_jit_compile fatal insn size error\n"); + pr_err("bpf_jit: fatal insn size error\n"); return -EFAULT; } if (image) { if (unlikely(proglen + ilen > oldproglen)) { - pr_err("bpf_jit_compile fatal error\n"); + pr_err("bpf_jit: fatal error\n"); return -EFAULT; } memcpy(image + proglen, temp, ilen); @@ -1085,10 +1085,6 @@ common_load: return proglen; } -void bpf_jit_compile(struct bpf_prog *prog) -{ -} - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; diff --git a/include/linux/filter.h b/include/linux/filter.h index e4eb2546339a..c7a70e0cc3a0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,6 +607,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); +void bpf_jit_compile(struct bpf_prog *prog); bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, @@ -625,7 +626,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); -void bpf_jit_compile(struct bpf_prog *fp); void bpf_jit_free(struct bpf_prog *fp); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); @@ -669,10 +669,6 @@ static inline bool bpf_jit_blinding_enabled(void) return true; } #else -static inline void bpf_jit_compile(struct bpf_prog *fp) -{ -} - static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index fddd76b1b627..2831ba1e71c1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1154,12 +1154,22 @@ const struct bpf_func_proto bpf_tail_call_proto = { .arg3_type = ARG_ANYTHING, }; -/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +/* Stub for JITs that only support cBPF. eBPF programs are interpreted. + * It is encouraged to implement bpf_int_jit_compile() instead, so that + * eBPF and implicitly also cBPF can get JITed! + */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } +/* Stub for JITs that support eBPF. All cBPF code gets transformed into + * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). + */ +void __weak bpf_jit_compile(struct bpf_prog *prog) +{ +} + bool __weak bpf_helper_changes_pkt_data(void *func) { return false; -- cgit v1.2.3 From 74451e66d516c55e309e8d89a4a1e7596e46aacd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:50 +0100 Subject: bpf: make jited programs visible in traces Long standing issue with JITed programs is that stack traces from function tracing check whether a given address is kernel code through {__,}kernel_text_address(), which checks for code in core kernel, modules and dynamically allocated ftrace trampolines. But what is still missing is BPF JITed programs (interpreted programs are not an issue as __bpf_prog_run() will be attributed to them), thus when a stack trace is triggered, the code walking the stack won't see any of the JITed ones. The same for address correlation done from user space via reading /proc/kallsyms. This is read by tools like perf, but the latter is also useful for permanent live tracing with eBPF itself in combination with stack maps when other eBPF types are part of the callchain. See offwaketime example on dumping stack from a map. This work tries to tackle that issue by making the addresses and symbols known to the kernel. The lookup from *kernel_text_address() is implemented through a latched RB tree that can be read under RCU in fast-path that is also shared for symbol/size/offset lookup for a specific given address in kallsyms. The slow-path iteration through all symbols in the seq file done via RCU list, which holds a tiny fraction of all exported ksyms, usually below 0.1 percent. Function symbols are exported as bpf_prog_, in order to aide debugging and attribution. This facility is currently enabled for root-only when bpf_jit_kallsyms is set to 1, and disabled if hardening is active in any mode. The rationale behind this is that still a lot of systems ship with world read permissions on kallsyms thus addresses should not get suddenly exposed for them. If that situation gets much better in future, we always have the option to change the default on this. Likewise, unprivileged programs are not allowed to add entries there either, but that is less of a concern as most such programs types relevant in this context are for root-only anyway. If enabled, call graphs and stack traces will then show a correct attribution; one example is illustrated below, where the trace is now visible in tooling such as perf script --kallsyms=/proc/kallsyms and friends. Before: 7fff8166889d bpf_clone_redirect+0x80007f0020ed (/lib/modules/4.9.0-rc8+/build/vmlinux) f5d80 __sendmsg_nocancel+0xffff006451f1a007 (/usr/lib64/libc-2.18.so) After: 7fff816688b7 bpf_clone_redirect+0x80007f002107 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fffa0575728 bpf_prog_33c45a467c9e061a+0x8000600020fb (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fffa07ef1fc cls_bpf_classify+0x8000600020dc (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff81678b68 tc_classify+0x80007f002078 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164d40b __netif_receive_skb_core+0x80007f0025fb (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164d718 __netif_receive_skb+0x80007f002018 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164e565 process_backlog+0x80007f002095 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164dc71 net_rx_action+0x80007f002231 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff81767461 __softirqentry_text_start+0x80007f0020d1 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff817658ac do_softirq_own_stack+0x80007f00201c (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff810a2c20 do_softirq+0x80007f002050 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff810a2cb5 __local_bh_enable_ip+0x80007f002085 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168d452 ip_finish_output2+0x80007f002152 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168ea3d ip_finish_output+0x80007f00217d (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168f2af ip_output+0x80007f00203f (/lib/modules/4.9.0-rc8+/build/vmlinux) [...] 7fff81005854 do_syscall_64+0x80007f002054 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff817649eb return_from_SYSCALL_64+0x80007f002000 (/lib/modules/4.9.0-rc8+/build/vmlinux) f5d80 __sendmsg_nocancel+0xffff01c484812007 (/usr/lib64/libc-2.18.so) Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 12 ++ arch/arm64/net/bpf_jit_comp.c | 15 --- arch/powerpc/net/bpf_jit_comp64.c | 1 + arch/s390/net/bpf_jit_comp.c | 18 --- arch/x86/net/bpf_jit_comp.c | 15 --- include/linux/bpf.h | 4 + include/linux/filter.h | 112 ++++++++++++++++++- kernel/bpf/core.c | 223 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 2 + kernel/extable.c | 9 +- kernel/kallsyms.c | 61 +++++++++-- net/Kconfig | 3 +- net/core/sysctl_net_core.c | 7 ++ 13 files changed, 419 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index b80fbd4e5575..2ebabc93014a 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -54,6 +54,18 @@ Values : 1 - enable JIT hardening for unprivileged users only 2 - enable JIT hardening for all users +bpf_jit_kallsyms +---------------- + +When Berkeley Packet Filter Just in Time compiler is enabled, then compiled +images are unknown addresses to the kernel, meaning they neither show up in +traces nor in /proc/kallsyms. This enables export of these addresses, which +can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature +is disabled. +Values : + 0 - disable JIT kallsyms export (default value) + 1 - enable JIT kallsyms export for privileged users only + dev_weight -------------- diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c444408d5a8c..05d12104d270 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -910,18 +910,3 @@ out: tmp : orig_prog); return prog; } - -void bpf_jit_free(struct bpf_prog *prog) -{ - unsigned long addr = (unsigned long)prog->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!prog->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(prog); -} diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index f9ebd02260da..c34166ef76fc 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -1064,6 +1064,7 @@ out: return fp; } +/* Overriding bpf_jit_free() as we don't set images read-only. */ void bpf_jit_free(struct bpf_prog *fp) { unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6454efd22e63..f1d0e62ec1dd 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1339,21 +1339,3 @@ out: tmp : orig_fp); return fp; } - -/* - * Free eBPF program - */ -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 26123d0ae13a..18a62e208826 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1180,18 +1180,3 @@ out: tmp : orig_prog); return prog; } - -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 57d60dc5b600..909fc033173a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -8,10 +8,12 @@ #define _LINUX_BPF_H 1 #include + #include #include #include #include +#include struct perf_event; struct bpf_map; @@ -177,6 +179,8 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + struct latch_tree_node ksym_tnode; + struct list_head ksym_lnode; const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; diff --git a/include/linux/filter.h b/include/linux/filter.h index c7a70e0cc3a0..0c1cc9143cb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -54,6 +54,12 @@ struct bpf_prog_aux; #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +/* As per nm, we expose JITed images as text (code) section for + * kallsyms. That way, tools like perf can find it to match + * addresses. + */ +#define BPF_SYM_ELF_TYPE 't' + /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 @@ -555,6 +561,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { set_memory_rw((unsigned long)fp, fp->pages); } + +static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) +{ + set_memory_rw((unsigned long)hdr, hdr->pages); +} #else static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { @@ -563,8 +574,21 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { } + +static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) +{ +} #endif /* CONFIG_DEBUG_SET_MODULE_RONX */ +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr = real_start & PAGE_MASK; + + return (void *)addr; +} + int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { @@ -617,6 +641,7 @@ void bpf_warn_invalid_xdp_action(u32 act); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; +extern int bpf_jit_kallsyms; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); @@ -651,6 +676,11 @@ static inline bool bpf_jit_is_ebpf(void) # endif } +static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) +{ + return fp->jited && bpf_jit_is_ebpf(); +} + static inline bool bpf_jit_blinding_enabled(void) { /* These are the prerequisites, should someone ever have the @@ -668,11 +698,91 @@ static inline bool bpf_jit_blinding_enabled(void) return true; } -#else + +static inline bool bpf_jit_kallsyms_enabled(void) +{ + /* There are a couple of corner cases where kallsyms should + * not be enabled f.e. on hardening. + */ + if (bpf_jit_harden) + return false; + if (!bpf_jit_kallsyms) + return false; + if (bpf_jit_kallsyms == 1) + return true; + + return false; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym); +bool is_bpf_text_address(unsigned long addr); +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym); + +static inline const char * +bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + const char *ret = __bpf_address_lookup(addr, size, off, sym); + + if (ret && modname) + *modname = NULL; + return ret; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp); +void bpf_prog_kallsyms_del(struct bpf_prog *fp); + +#else /* CONFIG_BPF_JIT */ + +static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) +{ + return false; +} + static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } + +static inline bool bpf_jit_kallsyms_enabled(void) +{ + return false; +} + +static inline const char * +__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + return NULL; +} + +static inline bool is_bpf_text_address(unsigned long addr) +{ + return false; +} + +static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +static inline const char * +bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + return NULL; +} + +static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ +} #endif /* CONFIG_BPF_JIT */ #define BPF_ANC BIT(15) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2831ba1e71c1..f45827e205d3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include #include @@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->aux = aux; fp->aux->prog = fp; + INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); @@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +static __always_inline void +bpf_get_prog_addr_region(const struct bpf_prog *prog, + unsigned long *symbol_start, + unsigned long *symbol_end) +{ + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); + unsigned long addr = (unsigned long)hdr; + + WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); + + *symbol_start = addr; + *symbol_end = addr + hdr->pages * PAGE_SIZE; +} + +static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + BUILD_BUG_ON(sizeof("bpf_prog_") + + sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + + sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); + sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + *sym = 0; +} + +static __always_inline unsigned long +bpf_get_prog_addr_start(struct latch_tree_node *n) +{ + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + return symbol_start; +} + +static __always_inline bool bpf_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); +} + +static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + if (val < symbol_start) + return -1; + if (val >= symbol_end) + return 1; + + return 0; +} + +static const struct latch_tree_ops bpf_tree_ops = { + .less = bpf_tree_less, + .comp = bpf_tree_comp, +}; + +static DEFINE_SPINLOCK(bpf_lock); +static LIST_HEAD(bpf_kallsyms); +static struct latch_tree_root bpf_tree __cacheline_aligned; + +int bpf_jit_kallsyms __read_mostly; + +static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +{ + WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); + list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); +} + +static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +{ + if (list_empty(&aux->ksym_lnode)) + return; + + latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&aux->ksym_lnode); +} + +static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) +{ + return fp->jited && !bpf_prog_was_classic(fp); +} + +static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +{ + return list_empty(&fp->aux->ksym_lnode) || + fp->aux->ksym_lnode.prev == LIST_POISON2; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp) || + !capable(CAP_SYS_ADMIN)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_add(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_del(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +{ + struct latch_tree_node *n; + + if (!bpf_jit_kallsyms_enabled()) + return NULL; + + n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); + return n ? + container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + NULL; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog *prog; + char *ret = NULL; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (prog) { + bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + bpf_get_prog_name(prog, sym); + + ret = sym; + if (size) + *size = symbol_end - symbol_start; + if (off) + *off = addr - symbol_start; + } + rcu_read_unlock(); + + return ret; +} + +bool is_bpf_text_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = bpf_prog_kallsyms_find(addr) != NULL; + rcu_read_unlock(); + + return ret; +} + +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog_aux *aux; + unsigned int it = 0; + int ret = -ERANGE; + + if (!bpf_jit_kallsyms_enabled()) + return ret; + + rcu_read_lock(); + list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + if (it++ != symnum) + continue; + + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + bpf_get_prog_name(aux->prog, sym); + + *value = symbol_start; + *type = BPF_SYM_ELF_TYPE; + + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) module_memfree(hdr); } +/* This symbol is only overridden by archs that have different + * requirements than the usual eBPF JITs, f.e. when they only + * implement cBPF JIT, do not set images read-only, etc. + */ +void __weak bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) { + struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + bpf_jit_binary_unlock_ro(hdr); + bpf_jit_binary_free(hdr); + + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); + } + + bpf_prog_unlock_free(fp); +} + int bpf_jit_harden __read_mostly; static int bpf_jit_blind_insn(const struct bpf_insn *from, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f74ca17af64a..461eb1e66a0f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -707,6 +707,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } @@ -903,6 +904,7 @@ static int bpf_prog_load(union bpf_attr *attr) /* failed to allocate fd */ goto free_used_maps; + bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; diff --git a/kernel/extable.c b/kernel/extable.c index e3beec4a2339..bd82117ad424 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_ftrace_trampoline(addr)) return 1; + if (is_bpf_text_address(addr)) + return 1; /* * There might be init symbols in saved stacktraces. * Give those symbols a chance to be printed in @@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; - return is_ftrace_trampoline(addr); + if (is_ftrace_trampoline(addr)) + return 1; + if (is_bpf_text_address(addr)) + return 1; + return 0; } /* diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fafd1a3ef0da..6a3b249a2ae1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; + if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); - - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); + return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || + !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } /* @@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { + const char *ret; + namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr, return namebuf; } - /* See if it's in a module. */ - return module_address_lookup(addr, symbolsize, offset, modname, - namebuf); + /* See if it's in a module or a BPF JITed image. */ + ret = module_address_lookup(addr, symbolsize, offset, + modname, namebuf); + if (!ret) + ret = bpf_address_lookup(addr, symbolsize, + offset, modname, namebuf); + return ret; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; + loff_t pos_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -481,13 +490,27 @@ struct kallsym_iter { static int get_ksymbol_mod(struct kallsym_iter *iter) { - if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, - &iter->type, iter->name, iter->module_name, - &iter->exported) < 0) + int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_mod_end = iter->pos; return 0; + } + return 1; } +static int get_ksymbol_bpf(struct kallsym_iter *iter) +{ + iter->module_name[0] = '\0'; + iter->exported = 0; + return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; +} + /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { @@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; + if (new_pos == 0) + iter->pos_mod_end = 0; +} + +static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) +{ + iter->pos = pos; + + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) + return get_ksymbol_bpf(iter); + + if (!get_ksymbol_mod(iter)) + return get_ksymbol_bpf(iter); + + return 1; } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. */ - if (pos >= kallsyms_num_syms) { - iter->pos = pos; - return get_ksymbol_mod(iter); - } + if (pos >= kallsyms_num_syms) + return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) diff --git a/net/Kconfig b/net/Kconfig index f19c0c3b9589..102f781a0131 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -297,7 +297,8 @@ config BPF_JIT Note, admin should enable this feature changing: /proc/sys/net/core/bpf_jit_enable - /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_kallsyms (optional) config NET_FLOW_LIMIT bool diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index eaa72eb0399c..4ead336e14ea 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -334,6 +334,13 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dointvec, }, + { + .procname = "bpf_jit_kallsyms", + .data = &bpf_jit_kallsyms, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, # endif #endif { -- cgit v1.2.3