From b285fcb760da7aa87d6d31e6c6a4907d82d9299c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:14:19 -0700 Subject: bpf: bump jmp sequence limit The limit of 1024 subsequent jumps was causing otherwise valid programs to be rejected. Bump it to 8192 and make the error more verbose. Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..3f8b5443cc67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL @@ -782,8 +782,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; elem->st.speculative |= speculative; - if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose(env, "BPF program is too complex\n"); + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, "The sequence of %d jumps is too complex.\n", + env->stack_size); goto err; } return &elem->st; -- cgit v1.2.3 From 5d839021675a2e1b76653189cc6a90cfd8e30a69 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:05 -0700 Subject: bpf: cleanup explored_states clean up explored_states to prep for introduction of hashtable No functional changes. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3f8b5443cc67..736b5a0d4848 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5437,6 +5437,17 @@ enum { }; #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) +static struct bpf_verifier_state_list **explored_state( + struct bpf_verifier_env *env, + int idx) +{ + return &env->explored_states[idx]; +} + +static void init_explored_state(struct bpf_verifier_env *env, int idx) +{ + env->explored_states[idx] = STATE_LIST_MARK; +} /* t, w, e - match pseudo-code above: * t - index of current instruction @@ -5462,7 +5473,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) if (e == BRANCH) /* mark branch target for state pruning */ - env->explored_states[w] = STATE_LIST_MARK; + init_explored_state(env, w); if (insn_state[w] == 0) { /* tree-edge */ @@ -5530,9 +5541,9 @@ peek_stack: else if (ret < 0) goto err_free; if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); if (ret == 1) goto peek_stack; @@ -5555,10 +5566,10 @@ peek_stack: * after every call and jump */ if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); } else { /* conditional jump with two edges */ - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret == 1) goto peek_stack; @@ -6006,7 +6017,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state_list *sl; int i; 
- sl = env->explored_states[insn]; + sl = *explored_state(env, insn); if (!sl) return; @@ -6365,7 +6376,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = &env->explored_states[insn_idx]; + pprev = explored_state(env, insn_idx); sl = *pprev; if (!sl) @@ -6452,8 +6463,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } - new_sl->next = env->explored_states[insn_idx]; - env->explored_states[insn_idx] = new_sl; + new_sl->next = *explored_state(env, insn_idx); + *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just -- cgit v1.2.3 From a8f500af0ccffc3d2aaf9018537981cb173865a1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:06 -0700 Subject: bpf: split explored_states split explored_states into prune_point boolean mark and linked list of explored states. This removes the STATE_LIST_MARK hack and allows marks to be separate from states. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 31 +++++++++++++------------------ 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1305ccbd8fe6..02bba09a0ea1 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -233,6 +233,7 @@ struct bpf_insn_aux_data { int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ u8 alu_state; /* used in combination with alu_limit */ + bool prune_point; unsigned int orig_idx; /* original instruction index */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 736b5a0d4848..6a3e69ba891e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5436,7 +5436,6 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static struct bpf_verifier_state_list **explored_state( struct bpf_verifier_env *env, int idx) @@ -5446,7 +5445,7 @@ static struct bpf_verifier_state_list **explored_state( static void init_explored_state(struct bpf_verifier_env *env, int idx) { - env->explored_states[idx] = STATE_LIST_MARK; + env->insn_aux_data[idx].prune_point = true; } /* t, w, e - match pseudo-code above: @@ -6018,10 +6017,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, int i; sl = *explored_state(env, insn); - if (!sl) - return; - - while (sl != STATE_LIST_MARK) { + while (sl) { if (sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) @@ -6376,18 +6372,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = explored_state(env, insn_idx); - sl = *pprev; - - if (!sl) + if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ return 0; + pprev = explored_state(env, insn_idx); + sl = *pprev; + clean_live_states(env, insn_idx, cur); - while (sl != STATE_LIST_MARK) { + while (sl) { if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -8145,13 +8141,12
@@ static void free_states(struct bpf_verifier_env *env) for (i = 0; i < env->prog->len; i++) { sl = env->explored_states[i]; - if (sl) - while (sl != STATE_LIST_MARK) { - sln = sl->next; - free_verifier_state(&sl->state, false); - kfree(sl); - sl = sln; - } + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } } kvfree(env->explored_states); -- cgit v1.2.3 From dc2a4ebc0b44a212fcf72242210e56aa17e7317b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:07 -0700 Subject: bpf: convert explored_states to hash table All prune points inside a callee bpf function most likely will have different callsites. For example, if function foo() is called from two callsites, half of the explored states in all prune points in foo() will be useless for subsequent walking of one of those callsites. Fortunately, the explored_states pruning heuristic keeps the number of states per prune point small, but walking these states is still a waste of cpu time when the callsite of the current state is different from the callsite of the explored state. To improve the pruning logic, convert explored_states into a hash table and use a simple insn_idx ^ callsite hash to select the hash bucket. This optimization has no effect on programs without bpf2bpf calls and drastically improves programs with calls. In the latter case it reduces total memory consumption in 1M scale tests by almost 3 times (peak_states drops from 5752 to 2016). Care should be taken when comparing the states for equivalency. Since the same hash bucket can now contain states with different indices, the insn_idx has to be part of verifier_state and compared. Different hash table sizes and different hash functions were explored, but the results were not significantly better vs this patch. They can be improved in the future. The hit/miss heuristic does not count an index miscompare as a miss. Otherwise verifier stats become unstable when experimenting with different hash functions.
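[ Illustration, with made-up numbers rather than ones taken from a real program: in a 1000-insn program where foo() has a prune point at insn 500 and is called from callsites at insns 100 and 200, the two groups of states land in different buckets, since (500 ^ 100) % 1000 = 400 while (500 ^ 200) % 1000 = 316, so walking one bucket no longer touches the states accumulated for the other callsite. Collisions are still possible, which is why is_state_visited() additionally compares sl->state.insn_idx before attempting states_equal(). ]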
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 23 ++++++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 02bba09a0ea1..405b502283c5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -187,6 +187,7 @@ struct bpf_func_state { struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; + u32 insn_idx; u32 curframe; u32 active_spin_lock; bool speculative; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a3e69ba891e..550091c7a46a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5436,11 +5436,19 @@ enum { BRANCH = 2, }; +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + static struct bpf_verifier_state_list **explored_state( struct bpf_verifier_env *env, int idx) { - return &env->explored_states[idx]; + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } static void init_explored_state(struct bpf_verifier_env *env, int idx) @@ -6018,7 +6026,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = *explored_state(env, insn); while (sl) { - if (sl->state.curframe != cur->curframe) + if (sl->state.insn_idx != insn || + sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) @@ -6384,6 +6393,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) clean_live_states(env, insn_idx, cur); while (sl) { + states_cnt++; + if (sl->state.insn_idx != insn_idx) + goto next; if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -6401,7 +6413,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - states_cnt++; sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. @@ -6428,6 +6439,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) sl = *pprev; continue; } +next: pprev = &sl->next; sl = *pprev; } @@ -6459,6 +6471,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } + new->insn_idx = insn_idx; new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -8138,7 +8151,7 @@ static void free_states(struct bpf_verifier_env *env) if (!env->explored_states) return; - for (i = 0; i < env->prog->len; i++) { + for (i = 0; i < state_htab_size(env); i++) { sl = env->explored_states[i]; while (sl) { @@ -8246,7 +8259,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kvcalloc(env->prog->len, + env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; -- cgit v1.2.3 From 8b401f9ed2441ad9e219953927a842d24ed051fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 May 2019 14:47:45 -0700 Subject: bpf: implement bpf_send_signal() helper This patch tries to solve the following specific use case. 
Currently, a bpf program can already collect stack traces through the kernel function get_perf_callchain() when certain events happen (e.g., cache miss counter or cpu clock counter overflows). But such stack traces are not enough for jitted programs, e.g., hhvm (jitted php). To get the real stack trace, the jit engine's internal data structures need to be traversed in order to get the real user functions. The bpf program itself may not be the best place to traverse the jit engine, as the traversal logic could be complex and it is not a stable interface either. Instead, hhvm implements a signal handler, e.g. for SIGALRM, and a set of program locations at which it can dump stack traces. When it receives a signal, it will dump the stack at the next such program location. Such a mechanism can be implemented in the following way: . a perf ring buffer is created between the bpf program and the tracing app. . once a particular event happens, the bpf program writes to the ring buffer and the tracing app gets notified. . the tracing app sends a signal SIGALRM to the hhvm. But this method could have large delays, causing the profiling results to be skewed. This patch implements the bpf_send_signal() helper to send a signal to hhvm in real time, resulting in the intended stack traces. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 17 +++++++++++- kernel/trace/bpf_trace.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63e0cf66f01a..68d4470523a0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2672,6 +2672,20 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * + * int bpf_send_signal(u32 sig) + * Description + * Send signal *sig* to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2782,7 +2796,8 @@ union bpf_attr { FN(strtol), \ FN(strtoul), \ FN(sk_storage_get), \ - FN(sk_storage_delete), + FN(sk_storage_delete), \ + FN(send_signal), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..70029eafc71f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -567,6 +567,63 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; +struct send_signal_irq_work { + struct irq_work irq_work; + struct task_struct *task; + u32 sig; +}; + +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); + +static void do_bpf_send_signal(struct irq_work *entry) +{ + struct send_signal_irq_work *work; + + work = container_of(entry, struct send_signal_irq_work, irq_work); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + struct send_signal_irq_work *work = NULL; + + /* Similar to bpf_probe_write_user, task needs to be + * in a sound condition and kernel memory access be + * permitted in order to send signal to the current + * task.
+ */ + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(uaccess_kernel())) + return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; + + if (in_nmi()) { + work = this_cpu_ptr(&send_signal_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + return -EBUSY; + + /* Add the current task, which is the target of sending signal, + * to the irq_work. The current task may change when queued + * irq works get executed. + */ + work->task = current; + work->sig = sig; + irq_work_queue(&work->irq_work); + return 0; + } + + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); +} + +static const struct bpf_func_proto bpf_send_signal_proto = { + .func = bpf_send_signal, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -617,6 +674,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; #endif + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; default: return NULL; } @@ -1343,5 +1402,18 @@ static int __init bpf_event_init(void) return 0; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + fs_initcall(bpf_event_init); +subsys_initcall(send_signal_irq_work_init); #endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 5327ed3d44b754f5cc51d5b3f18e442eaebacff5 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:12 +0100 Subject: bpf: verifier: mark verified-insn with sub-register zext flag The eBPF ISA specification requires the high 32-bit to be cleared when a low 32-bit sub-register is written. This applies to the destination register of ALU32 insns etc. JIT back-ends must guarantee this semantic when doing code-gen. The x86_64 and AArch64 ISAs have the same semantics, so the corresponding JIT back-ends don't need to do extra work. However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches (PowerPC, SPARC etc.) need to do explicit zero extension to meet this requirement, otherwise code like the following will fail. u64_value = (u64) u32_value ... other uses of u64_value This is because the compiler could exploit the semantic described above and omit the zero extension when extending u32_value to u64_value; these JIT back-ends are then expected to guarantee the semantic by inserting extra zero extensions, which could be a significant increase in code size. Some benchmarks show there could be ~40% sub-register writes out of total insns, meaning at least ~40% extra code-gen. One observation is that these extra zero extensions are not always necessary. Take the above code snippet for example: it is possible u32_value will never be cast into a u64, in which case the value of the high 32-bit of u32_value could be ignored and the extra zero extension could be eliminated. This patch implements this idea: insns defining sub-registers will be marked when the high 32-bit of the defined sub-register matters. For those unmarked insns, it is safe to eliminate the high 32-bit clearance for them. Algo: - Split read flags into READ32 and READ64. - Record index of insn that does sub-register write. Keep the index inside reg state and update it during verifier insn walking.
- A full register read on a sub-register marks its definition insn as needing zero extension on dst register. A new sub-register write overrides the old one. - When propagating read64 during path pruning, also mark any insn defining a sub-register that is read in the pruned path as full-register. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 +++- kernel/bpf/verifier.c | 173 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 171 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 405b502283c5..704ed7971472 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -36,9 +36,11 @@ */ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ + REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ + REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ + REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, + REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ + REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; struct bpf_reg_state { @@ -131,6 +133,11 @@ struct bpf_reg_state { * pointing to bpf_func_state. */ u32 frameno; + /* Tracks subreg definition. The stored value is the insn_idx of the + * writing insn. This is safe because subreg_def is used before any insn + * patching which only happens after main verification finished. + */ + s32 subreg_def; enum bpf_reg_liveness live; }; @@ -233,6 +240,7 @@ struct bpf_insn_aux_data { int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ bool prune_point; unsigned int orig_idx; /* original instruction index */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 550091c7a46a..f6b4c7148c3e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -982,6 +982,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); } +#define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) { @@ -992,6 +993,7 @@ static void init_reg_state(struct bpf_verifier_env *env, mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; regs[i].parent = NULL; + regs[i].subreg_def = DEF_NOT_SUBREG; } /* frame pointer */ @@ -1137,7 +1139,7 @@ next: */ static int mark_reg_read(struct bpf_verifier_env *env, const struct bpf_reg_state *state, - struct bpf_reg_state *parent) + struct bpf_reg_state *parent, u8 flag) { bool writes = parent == state->parent; /* Observe write marks */ int cnt = 0; @@ -1152,17 +1154,26 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } - if (parent->live & REG_LIVE_READ) + /* The first condition is more likely to be true than the + * second, checked it first. 
+ */ + if ((parent->live & REG_LIVE_READ) == flag || + parent->live & REG_LIVE_READ64) /* The parentage chain never changes and * this parent was already marked as LIVE_READ. * There is no need to keep walking the chain again and * keep re-marking all parents as LIVE_READ. * This case happens when the same register is read * multiple times without writes into it in-between. + * Also, if parent has the stronger REG_LIVE_READ64 set, + * then no need to set the weak REG_LIVE_READ32. */ break; /* ... then we depend on parent's value */ - parent->live |= REG_LIVE_READ; + parent->live |= flag; + /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ + if (flag == REG_LIVE_READ64) + parent->live &= ~REG_LIVE_READ32; state = parent; parent = state->parent; writes = true; @@ -1174,12 +1185,111 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +/* This function is supposed to be used by the following 32-bit optimization + * code only. It returns TRUE if the source or destination register operates + * on 64-bit, otherwise return FALSE. + */ +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, + u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +{ + u8 code, class, op; + + code = insn->code; + class = BPF_CLASS(code); + op = BPF_OP(code); + if (class == BPF_JMP) { + /* BPF_EXIT for "main" will reach here. Return TRUE + * conservatively. + */ + if (op == BPF_EXIT) + return true; + if (op == BPF_CALL) { + /* BPF to BPF call will reach here because of marking + * caller saved clobber with DST_OP_NO_MARK for which we + * don't care the register def because they are anyway + * marked as NOT_INIT already. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) + return false; + /* Helper call will reach here because of arg type + * check, conservatively return TRUE. + */ + if (t == SRC_OP) + return true; + + return false; + } + } + + if (class == BPF_ALU64 || class == BPF_JMP || + /* BPF_END always use BPF_ALU class. */ + (class == BPF_ALU && op == BPF_END && insn->imm == 64)) + return true; + + if (class == BPF_ALU || class == BPF_JMP32) + return false; + + if (class == BPF_LDX) { + if (t != SRC_OP) + return BPF_SIZE(code) == BPF_DW; + /* LDX source must be ptr. */ + return true; + } + + if (class == BPF_STX) { + if (reg->type != SCALAR_VALUE) + return true; + return BPF_SIZE(code) == BPF_DW; + } + + if (class == BPF_LD) { + u8 mode = BPF_MODE(code); + + /* LD_IMM64 */ + if (mode == BPF_IMM) + return true; + + /* Both LD_IND and LD_ABS return 32-bit data. */ + if (t != SRC_OP) + return false; + + /* Implicit ctx ptr. */ + if (regno == BPF_REG_6) + return true; + + /* Explicit source could be any width. */ + return true; + } + + if (class == BPF_ST) + /* The only source register for BPF_ST is a ptr. */ + return true; + + /* Conservatively return true at default. */ + return true; +} + +static void mark_insn_zext(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + s32 def_idx = reg->subreg_def; + + if (def_idx == DEF_NOT_SUBREG) + return; + + env->insn_aux_data[def_idx - 1].zext_dst = true; + /* The dst will be zero extended, so won't be sub-register anymore. 
*/ + reg->subreg_def = DEF_NOT_SUBREG; +} + static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg, *regs = state->regs; + bool rw64; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -1187,6 +1297,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, } reg = ®s[regno]; + rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -1197,7 +1308,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, if (regno == BPF_REG_FP) return 0; - return mark_reg_read(env, reg, reg->parent); + if (rw64) + mark_insn_zext(env, reg); + + return mark_reg_read(env, reg, reg->parent, + rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1205,6 +1320,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } @@ -1384,7 +1500,8 @@ static int check_stack_read(struct bpf_verifier_env *env, state->regs[value_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); + reg_state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); return 0; } else { int zeros = 0; @@ -1401,7 +1518,8 @@ static int check_stack_read(struct bpf_verifier_env *env, return -EACCES; } mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); + reg_state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -2110,6 +2228,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); if (reg_type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; + /* A load of ctx field could have different + * actual load size with the one encoded in the + * insn. When the dst is PTR, it is for sure not + * a sub-register. + */ + regs[value_regno].subreg_def = DEF_NOT_SUBREG; } regs[value_regno].type = reg_type; } @@ -2369,7 +2493,8 @@ mark: * the whole slot to be marked as 'read' */ mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent); + state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); } return update_stack_depth(env, state, min_off); } @@ -3333,6 +3458,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + /* helper call returns 64-bit value. 
*/ + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ @@ -4264,6 +4392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -4274,6 +4403,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -5353,6 +5483,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * Already marked as written above. */ mark_reg_unknown(env, regs, BPF_REG_0); + /* ld_abs load up to 32-bit skb data. */ + regs[BPF_REG_0].subreg_def = env->insn_idx + 1; return 0; } @@ -6309,20 +6441,33 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +/* Return 0 if no propagation happened. Return negative error code if error + * happened. Otherwise, return the propagated bit. + */ static int propagate_liveness_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct bpf_reg_state *parent_reg) { + u8 parent_flag = parent_reg->live & REG_LIVE_READ; + u8 flag = reg->live & REG_LIVE_READ; int err; - if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + /* When comes here, read flags of PARENT_REG or REG could be any of + * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need + * of propagation if PARENT_REG has strongest REG_LIVE_READ64. + */ + if (parent_flag == REG_LIVE_READ64 || + /* Or if there is no read flag from REG. */ + !flag || + /* Or if the read flag from REG is the same as PARENT_REG. */ + parent_flag == flag) return 0; - err = mark_reg_read(env, reg, parent_reg); + err = mark_reg_read(env, reg, parent_reg, flag); if (err) return err; - return 0; + return flag; } /* A write screens off any subsequent reads; but write marks come from the @@ -6356,8 +6501,10 @@ static int propagate_liveness(struct bpf_verifier_env *env, for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { err = propagate_liveness_reg(env, &state_reg[i], &parent_reg[i]); - if (err) + if (err < 0) return err; + if (err == REG_LIVE_READ64) + mark_insn_zext(env, &parent_reg[i]); } /* Propagate stack slots. */ @@ -6367,11 +6514,11 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = &state->stack[i].spilled_ptr; err = propagate_liveness_reg(env, state_reg, parent_reg); - if (err) + if (err < 0) return err; } } - return err; + return 0; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) -- cgit v1.2.3 From b325fbca4b136886885e51f4d36e2adab76596e3 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:13 +0100 Subject: bpf: verifier: mark patched-insn with sub-register zext flag Patched insns do not go through generic verification and therefore don't have zero extension information collected during insn walking. We don't bother analyzing them at the moment; for any sub-register def that comes from them, just conservatively mark it as needing zero extension.
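[ A hypothetical example, not taken from this patch: if insn rewriting emits BPF_MOV32_IMM(BPF_REG_0, 0), the new insn is an ALU32 write, so insn_has_def32() returns true for it and adjust_insn_aux_data() below sets its zext_dst, even though no 64-bit read of r0 was ever observed for it. ]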
Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6b4c7148c3e..a6af3166acae 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1269,6 +1269,24 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, return true; } +/* Return TRUE if INSN doesn't have explicit value define. */ +static bool insn_no_def(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + return (class == BPF_JMP || class == BPF_JMP32 || + class == BPF_STX || class == BPF_ST); +} + +/* Return TRUE if INSN has defined any 32-bit value explicitly. */ +static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + if (insn_no_def(insn)) + return false; + + return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); +} + static void mark_insn_zext(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -7298,14 +7316,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, - u32 off, u32 cnt) +static int adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_prog *new_prog, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn *insn = new_prog->insnsi; + u32 prog_len; int i; + /* aux info at OFF always needs adjustment, no matter fast path + * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the + * original insn at old prog. + */ + old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + if (cnt == 1) return 0; + prog_len = new_prog->len; new_data = vzalloc(array_size(prog_len, sizeof(struct bpf_insn_aux_data))); if (!new_data) @@ -7313,8 +7340,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); - for (i = off; i < off + cnt - 1; i++) + for (i = off; i < off + cnt - 1; i++) { new_data[i].seen = true; + new_data[i].zext_dst = insn_has_def32(env, insn + i); + } env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -7347,7 +7376,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of env->insn_aux_data[off].orig_idx); return NULL; } - if (adjust_insn_aux_data(env, new_prog->len, off, len)) + if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); return new_prog; -- cgit v1.2.3 From a4b1d3c1ddf6cb441187b6c130a473c16a05a356 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:15 +0100 Subject: bpf: verifier: insert zero extension according to analysis result After the previous patches, the verifier will mark an insn if it really needs zero extension on dst_reg. It is then for back-ends to decide how to use such information to eliminate unnecessary zero extension code-gen during JIT compilation. One approach is for the verifier to insert explicit zero extensions for those insns that need them, in a generic way; JIT back-ends then do not generate zero extension for sub-register writes by default.
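[ A sketch of the resulting insn stream, in informal notation: a marked 32-bit def such as w2 = w1 becomes the pair w2 = w1 followed by r2 = zext(w2), i.e. opt_subreg_zext_lo32() replaces each insn whose aux data has zext_dst set with { insn, BPF_ZEXT_REG(insn.dst_reg) } via bpf_patch_insn_data(). ]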
However, only those back-ends which do not have hardware zero extension want this optimization. Back-ends like x86_64 and AArch64 have hardware zero extension support, so the insertion should be disabled for them. This patch introduces a new target hook "bpf_jit_needs_zext" which returns false by default, meaning verifier zero extension insertion is disabled by default. A back-end can override this hook to return true if it doesn't have hardware support and wants the verifier to insert zero extensions explicitly. Offload targets do not use this native target hook; instead, they can get the optimization results using bpf_prog_offload_ops.finalize. NOTE: arches can have diverse features; it is possible for one arch to have hardware zero extension support for some sub-register write insns but not for all. For example, PowerPC and SPARC have zero-extended loads, but not for alu32. So when verifier zero extension insertion is enabled, these JIT back-ends need to peephole insns to remove the zero extensions inserted for insns that actually have hardware zero extension support. The peephole could be as simple as looking at the next insn: if it is the special zero extension insn, then it is safe to eliminate it if the current insn has hardware zero extension support. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/filter.h | 1 + kernel/bpf/core.c | 9 +++++++++ kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fb3aa2dc975..d98141edb74b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -370,6 +370,7 @@ struct bpf_prog_aux { u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ + bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ diff --git a/include/linux/filter.h b/include/linux/filter.h index bb10ffb88452..ba8b65270e0d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -825,6 +825,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); +bool bpf_jit_needs_zext(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(void) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 242a643af82f..3675b19ecb90 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2090,6 +2090,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func) return false; } +/* Return TRUE if the JIT backend wants verifier to enable sub-register usage + * analysis code and wants explicit zero extension inserted by verifier. + * Otherwise, return FALSE. + */ +bool __weak bpf_jit_needs_zext(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a6af3166acae..d4394a84b9eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7640,6 +7640,38 @@ static int opt_remove_nops(struct bpf_verifier_env *env) return 0; } +static int opt_subreg_zext_lo32(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int i, delta = 0, len = env->prog->len; + struct bpf_insn zext_patch[2]; + struct bpf_prog *new_prog; + + zext_patch[1] = BPF_ZEXT_REG(0); + for (i = 0; i < len; i++) { + int adj_idx = i + delta; + struct bpf_insn insn; + + if (!aux[adj_idx].zext_dst) + continue; + + insn = insns[adj_idx]; + zext_patch[0] = insn; + zext_patch[1].dst_reg = insn.dst_reg; + zext_patch[1].src_reg = insn.dst_reg; + new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + insns = new_prog->insnsi; + aux = env->insn_aux_data; + delta += 2; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -8490,6 +8522,15 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + /* do 32-bit optimization after insn patching has done so those patched + * insns could be handled correctly. + */ + if (ret == 0 && bpf_jit_needs_zext() && + !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32(env); + env->prog->aux->verifier_zext = !ret; + } + if (ret == 0) ret = fixup_call_args(env); -- cgit v1.2.3 From c240eff63a1cf1c4edc768e0cfc374811c02f069 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:16 +0100 Subject: bpf: introduce new bpf prog load flags "BPF_F_TEST_RND_HI32" x86_64 and AArch64 are perhaps the two arches that run the bpf testsuite most frequently; however, the zero extension insertion pass is not enabled for them because of their hardware support. It is critical to guarantee the pass's correctness, as it is supposed to be enabled by default for a couple of other arches, for example PowerPC, SPARC, arm, NFP etc. Therefore, it would be very useful if there were a way to test this pass on, for example, x86_64. The test methodology employed by this set is "poisoning" useless bits. The high 32-bit of a definition is randomized if it is identified as not used by any later insn. Such randomization is only enabled under a testing mode which is gated by the new bpf prog load flag "BPF_F_TEST_RND_HI32". Suggested-by: Alexei Starovoitov Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ kernel/bpf/syscall.c | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 68d4470523a0..7c6aef253173 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -260,6 +260,24 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen.
However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence hasn't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions who has been identified as safe to ignore them. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs. + */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..3d546b6f4646 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1604,7 +1604,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | + BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && -- cgit v1.2.3 From d6c2308c742a655f4598364ab331959639aae166 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:18 +0100 Subject: bpf: verifier: randomize high 32-bit when BPF_F_TEST_RND_HI32 is set This patch randomizes high 32-bit of a definition when BPF_F_TEST_RND_HI32 is set. Suggested-by: Alexei Starovoitov Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4394a84b9eb..2778417e6e0c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7640,33 +7640,79 @@ static int opt_remove_nops(struct bpf_verifier_env *env) return 0; } -static int opt_subreg_zext_lo32(struct bpf_verifier_env *env) +static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, + const union bpf_attr *attr) { + struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; - int i, delta = 0, len = env->prog->len; - struct bpf_insn zext_patch[2]; struct bpf_prog *new_prog; + bool rnd_hi32; + rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; zext_patch[1] = BPF_ZEXT_REG(0); + rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); + rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; - if (!aux[adj_idx].zext_dst) + insn = insns[adj_idx]; + if (!aux[adj_idx].zext_dst) { + u8 code, class; + u32 imm_rnd; + + if (!rnd_hi32) + continue; + + code = insn.code; + class = BPF_CLASS(code); + if (insn_no_def(&insn)) + continue; + + /* NOTE: arg "reg" (the fourth one) is only used for + * BPF_STX which has been ruled out in above + * check, it is safe to pass NULL here. + */ + if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { + if (class == BPF_LD && + BPF_MODE(code) == BPF_IMM) + i++; + continue; + } + + /* ctx load could be transformed into wider load. 
*/ + if (class == BPF_LDX && + aux[adj_idx].ptr_type == PTR_TO_CTX) + continue; + + imm_rnd = get_random_int(); + rnd_hi32_patch[0] = insn; + rnd_hi32_patch[1].imm = imm_rnd; + rnd_hi32_patch[3].dst_reg = insn.dst_reg; + patch = rnd_hi32_patch; + patch_len = 4; + goto apply_patch_buffer; + } + + if (!bpf_jit_needs_zext()) continue; - insn = insns[adj_idx]; zext_patch[0] = insn; zext_patch[1].dst_reg = insn.dst_reg; zext_patch[1].src_reg = insn.dst_reg; - new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2); + patch = zext_patch; + patch_len = 2; +apply_patch_buffer: + new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); if (!new_prog) return -ENOMEM; env->prog = new_prog; insns = new_prog->insnsi; aux = env->insn_aux_data; - delta += 2; + delta += patch_len - 1; } return 0; @@ -8525,10 +8571,10 @@ skip_full_check: /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ - if (ret == 0 && bpf_jit_needs_zext() && - !bpf_prog_is_dev_bound(env->prog->aux)) { - ret = opt_subreg_zext_lo32(env); - env->prog->aux->verifier_zext = !ret; + if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); + env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret + : false; } if (ret == 0) -- cgit v1.2.3 From e1afb70252a8614e1ef7aec05ff1b84fd324b782 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 25 May 2019 11:57:53 -0700 Subject: bpf: check signal validity in nmi for bpf_send_signal() helper Commit 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") introduced the bpf_send_signal() helper. If the context is nmi, the signal-sending work needs to be deferred to irq_work. If the signal is invalid, the error will appear in irq_work and it won't be propagated to the user. This patch adds an early check in the helper itself to notify the user of an invalid signal, as suggested by Daniel. Suggested-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 70029eafc71f..fe73926a07cd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -600,6 +600,12 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return -EPERM; if (in_nmi()) { + /* Do an early check on signal validity. Otherwise, + * the error is lost in deferred irq_work. + */ + if (unlikely(!valid_signal(sig))) + return -EINVAL; + work = this_cpu_ptr(&send_signal_work); if (work->irq_work.flags & IRQ_WORK_BUSY) return -EBUSY; -- cgit v1.2.3 From 4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 25 May 2019 09:37:39 -0700 Subject: bpf: decouple the lifetime of cgroup_bpf from cgroup itself Currently the lifetime of bpf programs attached to a cgroup is bound to the lifetime of the cgroup itself. It means that if a user forgets (or intentionally avoids) detaching a bpf program before removing the cgroup, it will stay attached up to the release of the cgroup. Since the cgroup can stay in the dying state (the state between being rmdir()'ed and being released) for a very long time, it leads to a waste of memory. Also, it blocks the possibility of implementing memcg-based memory accounting for bpf objects, because a circular reference dependency will occur.
Charged memory pages are pinning the corresponding memory cgroup, and if the memory cgroup is pinning the attached bpf program, nothing will ever be released. A dying cgroup cannot contain any processes, so the only chance for an attached bpf program to be executed is a live socket associated with the cgroup. So in order to release all bpf data early, let's count associated sockets using a new percpu refcounter. On cgroup removal the counter is transitioned to the atomic mode, and as soon as it reaches 0, all bpf programs are detached. Because cgroup_bpf_release() can block, it can't be called from the percpu ref counter callback directly, so instead an asynchronous work is scheduled. The reference counter is not socket specific, and can be used for any other type of program that can be executed from a cgroup-bpf hook outside of process context, should such a need arise in the future. Signed-off-by: Roman Gushchin Cc: jolsa@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 11 +++++++++-- include/linux/cgroup.h | 18 ++++++++++++++++++ kernel/bpf/cgroup.c | 41 +++++++++++++++++++++++++++++++++++++---- kernel/cgroup/cgroup.c | 11 ++++++++--- 4 files changed, 72 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index cb3c6b3b89c8..9f100fc422c3 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -72,10 +73,16 @@ struct cgroup_bpf { /* temp storage for effective prog array used by prog_attach/detach */ struct bpf_prog_array __rcu *inactive; + + /* reference counter used to detach bpf programs after cgroup removal */ + struct percpu_ref refcnt; + + /* cgroup_bpf is released using a work queue */ + struct work_struct release_work; }; -void cgroup_bpf_put(struct cgroup *cgrp); int cgroup_bpf_inherit(struct cgroup *cgrp); +void cgroup_bpf_offline(struct cgroup *cgrp); int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); @@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, struct bpf_prog; struct cgroup_bpf {}; -static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0077adeea83..49e8facf7c4a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task) #endif /* !CONFIG_CGROUPS */ +#ifdef CONFIG_CGROUP_BPF +static inline void cgroup_bpf_get(struct cgroup *cgrp) +{ + percpu_ref_get(&cgrp->bpf.refcnt); +} + +static inline void cgroup_bpf_put(struct cgroup *cgrp) +{ + percpu_ref_put(&cgrp->bpf.refcnt); +} + +#else /* CONFIG_CGROUP_BPF */ + +static inline void cgroup_bpf_get(struct cgroup *cgrp) {} +static inline void cgroup_bpf_put(struct cgroup *cgrp) {} + +#endif /* CONFIG_CGROUP_BPF */ + #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fcde0f7b2585..d995edbe816d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -22,12 +22,21 @@ DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +void cgroup_bpf_offline(struct cgroup *cgrp) +{ + cgroup_get(cgrp); +
percpu_ref_kill(&cgrp->bpf.refcnt); +} + /** - * cgroup_bpf_put() - put references of all bpf programs - * @cgrp: the cgroup to modify + * cgroup_bpf_release() - put references of all bpf programs and + * release all cgroup bpf data + * @work: work structure embedded into the cgroup to modify */ -void cgroup_bpf_put(struct cgroup *cgrp) +static void cgroup_bpf_release(struct work_struct *work) { + struct cgroup *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; unsigned int type; @@ -47,6 +56,22 @@ void cgroup_bpf_put(struct cgroup *cgrp) } bpf_prog_array_free(cgrp->bpf.effective[type]); } + + percpu_ref_exit(&cgrp->bpf.refcnt); + cgroup_put(cgrp); +} + +/** + * cgroup_bpf_release_fn() - callback used to schedule releasing + * of bpf cgroup data + * @ref: percpu ref counter structure + */ +static void cgroup_bpf_release_fn(struct percpu_ref *ref) +{ + struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); + + INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); + queue_work(system_wq, &cgrp->bpf.release_work); } /* count number of elements in the list. @@ -167,7 +192,12 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array __rcu *arrays[NR] = {}; - int i; + int ret, i; + + ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, + GFP_KERNEL); + if (ret) + return ret; for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); @@ -183,6 +213,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + + percpu_ref_exit(&cgrp->bpf.refcnt); + return -ENOMEM; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..ef9cfbfc82a9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4955,8 +4955,6 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - - cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5482,6 +5480,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); + cgroup_bpf_offline(cgrp); + /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6221,6 +6221,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) * Don't use cgroup_get_live(). */ cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); return; } @@ -6232,6 +6233,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { skcd->val = (unsigned long)cset->dfl_cgrp; + cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); @@ -6242,7 +6244,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) void cgroup_sk_free(struct sock_cgroup_data *skcd) { - cgroup_put(sock_cgroup_ptr(skcd)); + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + cgroup_bpf_put(cgrp); + cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ -- cgit v1.2.3 From 54e9c9d4b506b611228890752d1cfa960e0965e1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:41 -0700 Subject: bpf: remove __rcu annotations from bpf_prog_array Drop __rcu annotations and rcu read sections from bpf_prog_array helper functions. They are not needed since all existing callers call those helpers from the rcu update side while holding a mutex. This guarantees that use-after-free could not happen. 
In the next patches I'll fix the callers with missing rcu_dereference_protected to make sparse/lockdep happy, the proper way to use these helpers is: struct bpf_prog_array __rcu *progs = ...; struct bpf_prog_array *p; mutex_lock(&mtx); p = rcu_dereference_protected(progs, lockdep_is_held(&mtx)); bpf_prog_array_length(p); bpf_prog_array_copy_to_user(p, ...); bpf_prog_array_delete_safe(p, ...); bpf_prog_array_copy_info(p, ...); bpf_prog_array_copy(p, ...); bpf_prog_array_free(p); mutex_unlock(&mtx); No functional changes! rcu_dereference_protected with lockdep_is_held should catch any cases where we update prog array without a mutex (I've looked at existing call sites and I think we hold a mutex everywhere). Motivation is to fix sparse warnings: kernel/bpf/core.c:1803:9: warning: incorrect type in argument 1 (different address spaces) kernel/bpf/core.c:1803:9: expected struct callback_head *head kernel/bpf/core.c:1803:9: got struct callback_head [noderef] * kernel/bpf/core.c:1877:44: warning: incorrect type in initializer (different address spaces) kernel/bpf/core.c:1877:44: expected struct bpf_prog_array_item *item kernel/bpf/core.c:1877:44: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1901:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1901:26: expected struct bpf_prog_array_item *existing kernel/bpf/core.c:1901:26: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1935:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1935:26: expected struct bpf_prog_array_item *[assigned] existing kernel/bpf/core.c:1935:26: got struct bpf_prog_array_item [noderef] * v2: * remove comment about potential race; that can't happen because all callers are in rcu-update section Cc: Roman Gushchin Acked-by: Roman Gushchin Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 12 ++++++------ kernel/bpf/core.c | 37 +++++++++++++------------------------ 2 files changed, 19 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d98141edb74b..ff3e00ff84d2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -514,17 +514,17 @@ struct bpf_prog_array { }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_free(struct bpf_prog_array *progs); +int bpf_prog_array_length(struct bpf_prog_array *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3675b19ecb90..33fb292f2e30 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1795,38 +1795,33 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) return 
&empty_prog_array.hdr; } -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || - progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + if (!progs || progs == &empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *array) +int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; - rcu_read_lock(); - item = rcu_dereference(array)->items; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; - rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, +static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference_check(array, 1)->items; - for (; item->prog; item++) { + for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; @@ -1839,7 +1834,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; @@ -1850,18 +1845,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); - * so below kcalloc doesn't need extra cnt > 0 check, but - * bpf_prog_array_length() releases rcu lock and - * prog array could have been swapped with empty or larger array, - * so always copy 'cnt' prog_ids to the user. - * In a rare race the user will see zero prog_ids + * so below kcalloc doesn't need extra cnt > 0 check. 
*/ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; - rcu_read_lock(); nospc = bpf_prog_array_copy_core(array, ids, cnt); - rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) @@ -1871,19 +1860,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, +void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { - struct bpf_prog_array_item *item = array->items; + struct bpf_prog_array_item *item; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array) @@ -1947,7 +1936,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { -- cgit v1.2.3 From dbcc1ba26e43bd32cb308e50ac4cb4a29d2f5967 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:43 -0700 Subject: bpf: cgroup: properly use bpf_prog_array api Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. We also don't need __rcu annotations on cgroup_bpf.inactive since it's not read/updated concurrently. v4: * drop cgroup_rcu_xyz wrappers and use rcu APIs directly; presumably should be more clear to understand which mutex/refcount protects each particular place v3: * amend cgroup_rcu_dereference to include percpu_ref_is_dying; cgroup_bpf is now reference counted and we don't hold cgroup_mutex anymore in cgroup_bpf_release v2: * replace xchg with rcu_swap_protected Cc: Roman Gushchin Signed-off-by: Stanislav Fomichev Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 2 +- kernel/bpf/cgroup.c | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9f100fc422c3..b631ee75762d 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -72,7 +72,7 @@ struct cgroup_bpf { u32 flags[MAX_BPF_ATTACH_TYPE]; /* temp storage for effective prog array used by prog_attach/detach */ - struct bpf_prog_array __rcu *inactive; + struct bpf_prog_array *inactive; /* reference counter used to detach bpf programs after cgroup removal */ struct percpu_ref refcnt; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index d995edbe816d..ff594eb86fd7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -38,6 +38,7 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *cgrp = container_of(work, struct cgroup, bpf.release_work); enum bpf_cgroup_storage_type stype; + struct bpf_prog_array *old_array; unsigned int type; for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -54,7 +55,10 @@ static void cgroup_bpf_release(struct work_struct *work) kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_prog_array_free(cgrp->bpf.effective[type]); + old_array = rcu_dereference_protected( + cgrp->bpf.effective[type], + 
percpu_ref_is_dying(&cgrp->bpf.refcnt)); + bpf_prog_array_free(old_array); } percpu_ref_exit(&cgrp->bpf.refcnt); @@ -126,7 +130,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, */ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu **array) + struct bpf_prog_array **array) { enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; @@ -164,17 +168,16 @@ static int compute_effective_progs(struct cgroup *cgrp, } } while ((p = cgroup_parent(p))); - rcu_assign_pointer(*array, progs); + *array = progs; return 0; } static void activate_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu *array) + struct bpf_prog_array *old_array) { - struct bpf_prog_array __rcu *old_array; - - old_array = xchg(&cgrp->bpf.effective[type], array); + rcu_swap_protected(cgrp->bpf.effective[type], old_array, + lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ @@ -191,7 +194,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) * that array below is variable length */ #define NR ARRAY_SIZE(cgrp->bpf.effective) - struct bpf_prog_array __rcu *arrays[NR] = {}; + struct bpf_prog_array *arrays[NR] = {}; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, @@ -477,10 +480,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog_array *effective; int cnt, ret = 0, i; + effective = rcu_dereference_protected(cgrp->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + cnt = bpf_prog_array_length(effective); else cnt = prog_list_length(progs); @@ -497,8 +504,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], - prog_ids, cnt); + return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); } else { struct bpf_prog_list *pl; u32 id; -- cgit v1.2.3 From e672db03ab0e43e41ab6f8b2156a10d6e40f243d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:44 -0700 Subject: bpf: tracing: properly use bpf_prog_array api Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. 
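As an aside for readers following this series: below is a minimal sketch of the update-side pattern these patches rely on. It is not code from the patches, and the lock and variable names are made up. rcu_dereference_protected() takes no read-side lock; it only lockdep-asserts the given condition, and rcu_swap_protected() publishes a new pointer under the same assertion:

  static DEFINE_MUTEX(demo_mutex);
  static struct bpf_prog_array __rcu *demo_progs;

  static void demo_replace(struct bpf_prog_array *new_array)
  {
          mutex_lock(&demo_mutex);
          /* after the swap, new_array holds the old array */
          rcu_swap_protected(demo_progs, new_array,
                             lockdep_is_held(&demo_mutex));
          mutex_unlock(&demo_mutex);
          /* bpf_prog_array_free() defers the actual kfree by an
           * RCU grace period, so concurrent readers stay safe */
          bpf_prog_array_free(new_array);
  }

Since the helpers now take plain pointers, sparse flags any caller that passes the __rcu-annotated pointer without first dereferencing it under the mutex.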
Cc: Steven Rostedt
Cc: Ingo Molnar
Signed-off-by: Stanislav Fomichev
Signed-off-by: Daniel Borkmann
--- kernel/trace/bpf_trace.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fe73926a07cd..3994a231eb92 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -19,6 +19,9 @@ #include "trace_probe.h" #include "trace.h" +#define bpf_event_rcu_dereference(p) \ + rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) + #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; @@ -1099,7 +1102,7 @@ static DEFINE_MUTEX(bpf_event_mutex); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; @@ -1117,7 +1120,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, if (event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; @@ -1140,7 +1143,7 @@ unlock: void perf_event_detach_bpf_prog(struct perf_event *event) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret; @@ -1149,7 +1152,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) if (!event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret == -ENOENT) goto unlock; @@ -1171,6 +1174,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; @@ -1195,10 +1199,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) */ mutex_lock(&bpf_event_mutex); - ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - ids, - ids_len, - &prog_cnt); + progs = bpf_event_rcu_dereference(event->tp_event->prog_array); + ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || -- cgit v1.2.3 From 5cf1e91456301f8c4f6bbc63ff76cff12f92f31b Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:36 -0700 Subject: bpf: cgroup inet skb programs can return 0 to 3

Allows cgroup inet skb programs to return values in the range [0, 3]. The second bit is used to determine if congestion occurred and the higher level protocol should decrease its rate, e.g. TCP would call tcp_enter_cwr().

The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS at load time if it uses the new return values (i.e. 2 or 3).

The expected_attach_type is currently not enforced for BPF_PROG_TYPE_CGROUP_SKB, meaning a bpf_prog with expected_attach_type set to BPF_CGROUP_INET_EGRESS can currently attach to BPF_CGROUP_INET_INGRESS. Blindly enforcing expected_attach_type would break backward compatibility, so this patch adds an enforce_expected_attach_type bit to enforce the expected_attach_type only when the program uses the new return values.
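For illustration only (this is not part of the patch): a hypothetical egress program using the extended return range. The SEC() annotation is the libbpf convention and the length threshold is made up; the return encoding follows the description above, with bit 0 meaning "let the packet through" and bit 1 meaning "congestion occurred":

  SEC("cgroup_skb/egress")
  int egress_cn(struct __sk_buff *skb)
  {
          /* signal congestion but still transmit the packet;
           * TCP reacts by calling tcp_enter_cwr() */
          if (skb->len > 1200)
                  return 3;
          return 1;       /* plain allow */
  }

A program like this has to be loaded with expected_attach_type set to BPF_CGROUP_INET_EGRESS; the verifier change below then sets enforce_expected_attach_type, so attaching it to the ingress hook is rejected.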
Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++- kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 16 +++++++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index ba8b65270e0d..43b45d6db36d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -526,7 +526,8 @@ struct bpf_prog { blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1; /* callchain buffer allocated? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3d546b6f4646..1539774d78c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? 
+ -EINVAL : 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2778417e6e0c..5c2cb5bd84ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: @@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } -- cgit v1.2.3 From e7a3160d092aa4dd7fb2ef23335cb3a98400aec6 Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:37 -0700 Subject: bpf: Update __cgroup_bpf_run_filter_skb with cn

For egress packets, __cgroup_bpf_run_filter_skb() will now call BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() instead of PROG_CGROUP_RUN_ARRAY() in order to propagate congestion notification (cn) requests to TCP callers.

For egress packets, this function can return:
NET_XMIT_SUCCESS (0) - continue with packet output
NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
NET_XMIT_CN (2) - continue with packet output and notify TCP to call cwr
-EPERM - drop packet

For ingress packets, this function will return -EPERM if any attached program was found and if it returned != 1 during execution. Otherwise 0 is returned.

Signed-off-by: Lawrence Brakmo
Signed-off-by: Alexei Starovoitov
--- kernel/bpf/cgroup.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ff594eb86fd7..1b65ab0df457 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned.
*/ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); -- cgit v1.2.3 From ffc8b144d5d056dd0ab8d995c7345cdd6a589fc7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:55 -0700 Subject: bpf: add memlock precharge check for cgroup_local_storage

Cgroup local storage maps lack the memlock precharge check, which is performed before the memory allocation for most other bpf map types. Let's add it in order to unify all map types.

Signed-off-by: Roman Gushchin
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
--- kernel/bpf/local_storage.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 980e8f1f6cb5..e48302ecb389 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + u32 pages; + int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); @@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); + pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> + PAGE_SHIFT; + ret = bpf_map_precharge_memlock(pages); + if (ret < 0) + return ERR_PTR(ret); + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); if (!map) return ERR_PTR(-ENOMEM); - map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), - PAGE_SIZE) >> PAGE_SHIFT; + map->map.pages = pages; /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); -- cgit v1.2.3 From 3539b96e041c06e4317082816d90ec09160aeb11 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:57 -0700 Subject: bpf: group memory related fields in struct bpf_map_memory

Group "user" and "pages" fields of bpf_map into the bpf_map_memory structure. Later it can be extended with "memcg" and other related information.

The main reason for such a change (besides cosmetics) is to pass the bpf_map_memory structure to charging functions before the actual allocation of bpf_map.
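For orientation, condensed from the diff below: after this patch the accounting state lives in one place, and every former map->pages or map->user access goes through map->memory instead.

  struct bpf_map_memory {
          u32 pages;                /* charged size in pages */
          struct user_struct *user; /* user being charged */
  };

  struct bpf_map {
          ...
          struct bpf_map_memory memory;
          ...
  };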
Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 +++++++--- kernel/bpf/arraymap.c | 2 +- kernel/bpf/cpumap.c | 4 ++-- kernel/bpf/devmap.c | 4 ++-- kernel/bpf/hashtab.c | 4 ++-- kernel/bpf/local_storage.c | 2 +- kernel/bpf/lpm_trie.c | 4 ++-- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 4 ++-- kernel/bpf/syscall.c | 19 ++++++++++--------- kernel/bpf/xskmap.c | 4 ++-- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 4 ++-- 14 files changed, 36 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2cc58fc0f413..2e7c1c40d949 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -66,6 +66,11 @@ struct bpf_map_ops { u64 imm, u32 *off); }; +struct bpf_map_memory { + u32 pages; + struct user_struct *user; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -86,7 +91,7 @@ struct bpf_map { u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; - u32 pages; + struct bpf_map_memory memory; bool unpriv_array; bool frozen; /* write-once */ /* 48 bytes hole */ @@ -94,8 +99,7 @@ struct bpf_map { /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ - struct user_struct *user ____cacheline_aligned; - atomic_t refcnt; + atomic_t refcnt ____cacheline_aligned; atomic_t usercnt; struct work_struct work; char name[BPF_OBJ_NAME_LEN]; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 584636c9e2eb..8fda24e78193 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -138,7 +138,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + array->map.memory.pages = cost; array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index cf727d77c6c6..035268add724 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) goto free_cmap; - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.pages); + ret = bpf_map_precharge_memlock(cmap->map.memory.pages); if (ret) { err = ret; goto free_cmap; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..f6c57efb1d0d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -111,10 +111,10 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; - dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.pages); + err = bpf_map_precharge_memlock(dtab->map.memory.pages); if (err) goto free_dtab; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0f2708fde5f7..15bf228d2e98 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -364,10 +364,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) 
/* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.pages); + err = bpf_map_precharge_memlock(htab->map.memory.pages); if (err) goto free_htab; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e48302ecb389..574325276650 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -303,7 +303,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (!map) return ERR_PTR(-ENOMEM); - map->map.pages = pages; + map->map.memory.pages = pages; /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..8e423a582760 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -578,9 +578,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) goto out_err; } - trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(trie->map.pages); + ret = bpf_map_precharge_memlock(trie->map.memory.pages); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0b140d236889..8a510e71d486 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -89,7 +89,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&qs->map, attr); - qs->map.pages = cost; + qs->map.memory.pages = cost; qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 18e225de80ff..819515242739 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -176,7 +176,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + array->map.memory.pages = cost; return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..08d4efff73ac 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -131,9 +131,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(smap->map.pages); + err = bpf_map_precharge_memlock(smap->map.memory.pages); if (err) goto free_smap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1539774d78c7..8289a2ce14fc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -222,19 +222,20 @@ static int bpf_map_init_memlock(struct bpf_map *map) struct user_struct *user = get_current_user(); int ret; - ret = bpf_charge_memlock(user, map->pages); + ret = bpf_charge_memlock(user, map->memory.pages); if (ret) { free_uid(user); return ret; } - map->user = user; + map->memory.user = user; return ret; } static void bpf_map_release_memlock(struct bpf_map *map) { - struct user_struct *user = map->user; - bpf_uncharge_memlock(user, map->pages); + struct user_struct *user = map->memory.user; + + bpf_uncharge_memlock(user, map->memory.pages); free_uid(user); } @@ -242,17 +243,17 @@ int 
bpf_map_charge_memlock(struct bpf_map *map, u32 pages) { int ret; - ret = bpf_charge_memlock(map->user, pages); + ret = bpf_charge_memlock(map->memory.user, pages); if (ret) return ret; - map->pages += pages; + map->memory.pages += pages; return ret; } void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) { - bpf_uncharge_memlock(map->user, pages); - map->pages -= pages; + bpf_uncharge_memlock(map->memory.user, pages); + map->memory.pages -= pages; } static int bpf_map_alloc_id(struct bpf_map *map) @@ -395,7 +396,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT, + map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 686d244e798d..f816ee1a0fa0 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -40,10 +40,10 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_m; - m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.pages); + err = bpf_map_precharge_memlock(m->map.memory.pages); if (err) goto free_m; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 9a8aaf8e235d..92581c3ff220 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -659,7 +659,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % BPF_SK_STORAGE_CACHE_SIZE; - smap->map.pages = pages; + smap->map.memory.pages = pages; return &smap->map; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index be6092ac69f8..4eb5b6a1b29f 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) goto free_stab; } - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.pages); + stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(stab->map.memory.pages); if (err) goto free_stab; -- cgit v1.2.3 From b936ca643ade11f265fa10e5fb71c20d9c5243f1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:58 -0700 Subject: bpf: rework memlock-based memory accounting for maps In order to unify the existing memlock charging code with the memcg-based memory accounting, which will be added later, let's rework the current scheme. 
Currently the following design is used:
1) .alloc() callback optionally checks if the allocation will likely succeed using bpf_map_precharge_memlock()
2) .alloc() performs actual allocations
3) .alloc() callback calculates map cost and sets map.memory.pages
4) map_create() calls bpf_map_init_memlock() which sets map.memory.user and performs actual charging; in case of failure the map is destroyed

and on the release path:
1) bpf_map_free_deferred() calls bpf_map_release_memlock(), which performs uncharge and releases the user
2) .map_free() callback releases the memory

The scheme can be simplified and made more robust:
1) .alloc() calculates map cost and calls bpf_map_charge_init()
2) bpf_map_charge_init() sets map.memory.user and performs actual charge
3) .alloc() performs actual allocations

and on the release path:
1) .map_free() callback releases the memory
2) bpf_map_charge_finish() performs uncharge and releases the user

The new scheme also allows reusing the bpf_map_charge_init()/finish() functions for memcg-based accounting. Because charges are performed before actual allocations and uncharges after freeing the memory, no bogus memory pressure can be created.

In cases when the map structure is not available (e.g. it's not created yet, or is already destroyed), an on-stack bpf_map_memory structure is used. The charge can be transferred with the bpf_map_charge_move() function.

Signed-off-by: Roman Gushchin
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
--- include/linux/bpf.h | 5 +++- kernel/bpf/arraymap.c | 10 +++++-- kernel/bpf/cpumap.c | 8 +++-- kernel/bpf/devmap.c | 13 ++++---- kernel/bpf/hashtab.c | 11 +++---- kernel/bpf/local_storage.c | 9 ++++-- kernel/bpf/lpm_trie.c | 5 ++-- kernel/bpf/queue_stack_maps.c | 9 ++++-- kernel/bpf/reuseport_array.c | 9 ++++-- kernel/bpf/stackmap.c | 30 +++++++++++-------- kernel/bpf/syscall.c | 69 +++++++++++++++++++++---------------------- kernel/bpf/xskmap.c | 9 +++--- net/core/bpf_sk_storage.c | 8 +++-- net/core/sock_map.c | 5 ++-- 14 files changed, 112 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2e7c1c40d949..3c8f24f402bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -650,9 +650,12 @@ struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -int bpf_map_precharge_memlock(u32 pages); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); +int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages); +void bpf_map_charge_finish(struct bpf_map_memory *mem); +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8fda24e78193..3552da4407d9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -83,6 +83,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); u64 cost, array_size, mask64; + struct bpf_map_memory mem; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -125,23 +126,26 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret =
bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } array->index_mask = index_mask; array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.memory.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { + bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 035268add724..c633c8d68023 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) goto free_cmap; - cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.memory.pages); + ret = bpf_map_charge_init(&cmap->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (ret) { err = ret; goto free_cmap; @@ -121,7 +121,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), __alignof__(unsigned long)); if (!cmap->flush_needed) - goto free_cmap; + goto free_charge; /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * @@ -133,6 +133,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: free_percpu(cmap->flush_needed); +free_charge: + bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6c57efb1d0d..371bd880ed58 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -111,10 +111,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; - dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.memory.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&dtab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_dtab; @@ -125,19 +124,21 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) __alignof__(unsigned long), GFP_KERNEL | __GFP_NOWARN); if (!dtab->flush_needed) - goto free_dtab; + goto free_charge; dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_dtab; + goto free_charge; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; +free_charge: + bpf_map_charge_finish(&dtab->map.memory); free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 15bf228d2e98..b0bdc7b040ad 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -364,10 +364,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.memory.pages = 
round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.memory.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&htab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_htab; @@ -376,7 +375,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_htab; + goto free_charge; if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; @@ -409,6 +408,8 @@ free_prealloc: prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); +free_charge: + bpf_map_charge_finish(&htab->map.memory); free_htab: kfree(htab); return ERR_PTR(err); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 574325276650..e49bfd4f4f6d 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -272,6 +272,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + struct bpf_map_memory mem; u32 pages; int ret; @@ -294,16 +295,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(pages); + ret = bpf_map_charge_init(&mem, pages); if (ret < 0) return ERR_PTR(ret); map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); - if (!map) + if (!map) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } - map->map.memory.pages = pages; + bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8e423a582760..6345a8d2dcd0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -578,9 +578,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) goto out_err; } - trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - ret = bpf_map_precharge_memlock(trie->map.memory.pages); + ret = bpf_map_charge_init(&trie->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8a510e71d486..224cb0fd8f03 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -67,6 +67,7 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem = {0}; struct bpf_queue_stack *qs; u64 size, queue_size, cost; @@ -77,19 +78,21 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) + if (!qs) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - qs->map.memory.pages = cost; + bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 819515242739..5c6e25b1b9b1 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ 
-151,6 +151,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; + struct bpf_map_memory mem; u64 cost, array_size; if (!capable(CAP_SYS_ADMIN)) @@ -165,18 +166,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(cost); + err = bpf_map_charge_init(&mem, cost); if (err) return ERR_PTR(err); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.memory.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 08d4efff73ac..8da24ca65d97 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -89,6 +89,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; + struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -116,40 +117,43 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); + err = bpf_map_charge_init(&mem, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + if (err) + return ERR_PTR(err); + smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) + if (!smap) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); - - err = -E2BIG; - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_smap; + } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_precharge_memlock(smap->map.memory.pages); - if (err) - goto free_smap; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_smap; + goto free_charge; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; + bpf_map_charge_move(&smap->map.memory, &mem); + return &smap->map; put_buffers: put_callchain_buffers(); -free_smap: +free_charge: + bpf_map_charge_finish(&mem); bpf_map_area_free(smap); return ERR_PTR(err); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8289a2ce14fc..4a5ebad99154 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -188,19 +188,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -int bpf_map_precharge_memlock(u32 pages) -{ - struct user_struct *user = get_current_user(); - unsigned long memlock_limit, cur; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - cur = atomic_long_read(&user->locked_vm); - free_uid(user); - if (cur + pages > memlock_limit) - return -EPERM; - return 0; -} - static int bpf_charge_memlock(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -214,29 +201,40 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages) static 
void bpf_uncharge_memlock(struct user_struct *user, u32 pages) { - atomic_long_sub(pages, &user->locked_vm); + if (user) + atomic_long_sub(pages, &user->locked_vm); } -static int bpf_map_init_memlock(struct bpf_map *map) +int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages) { struct user_struct *user = get_current_user(); int ret; - ret = bpf_charge_memlock(user, map->memory.pages); + ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); return ret; } - map->memory.user = user; - return ret; + + mem->pages = pages; + mem->user = user; + + return 0; } -static void bpf_map_release_memlock(struct bpf_map *map) +void bpf_map_charge_finish(struct bpf_map_memory *mem) { - struct user_struct *user = map->memory.user; + bpf_uncharge_memlock(mem->user, mem->pages); + free_uid(mem->user); +} - bpf_uncharge_memlock(user, map->memory.pages); - free_uid(user); +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src) +{ + *dst = *src; + + /* Make sure src will not be used for the redundant uncharging. */ + memset(src, 0, sizeof(struct bpf_map_memory)); } int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) @@ -304,11 +302,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + struct bpf_map_memory mem; - bpf_map_release_memlock(map); + bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); + bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -550,6 +550,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -574,7 +575,7 @@ static int map_create(union bpf_attr *attr) err = bpf_obj_name_cpy(map->name, attr->map_name); if (err) - goto free_map_nouncharge; + goto free_map; atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -584,20 +585,20 @@ static int map_create(union bpf_attr *attr) if (!attr->btf_value_type_id) { err = -EINVAL; - goto free_map_nouncharge; + goto free_map; } btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); - goto free_map_nouncharge; + goto free_map; } err = map_check_btf(map, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) { btf_put(btf); - goto free_map_nouncharge; + goto free_map; } map->btf = btf; @@ -609,15 +610,11 @@ static int map_create(union bpf_attr *attr) err = security_bpf_map_alloc(map); if (err) - goto free_map_nouncharge; - - err = bpf_map_init_memlock(map); - if (err) - goto free_map_sec; + goto free_map; err = bpf_map_alloc_id(map); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -633,13 +630,13 @@ static int map_create(union bpf_attr *attr) return err; -free_map: - bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); -free_map_nouncharge: +free_map: btf_put(map->btf); + bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); + bpf_map_charge_finish(&mem); return err; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index f816ee1a0fa0..a329dab7c7a4 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -40,10 +40,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_m; - m->map.memory.pages = 
round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.memory.pages); + err = bpf_map_charge_init(&m->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_m; @@ -51,7 +50,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) m->flush_list = alloc_percpu(struct list_head); if (!m->flush_list) - goto free_m; + goto free_charge; for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); @@ -65,6 +64,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) free_percpu: free_percpu(m->flush_list); +free_charge: + bpf_map_charge_finish(&m->map.memory); free_m: kfree(m); return ERR_PTR(err); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 92581c3ff220..621a0b07ff11 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -640,13 +640,16 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(pages); - if (ret < 0) + ret = bpf_map_charge_init(&smap->map.memory, pages); + if (ret < 0) { + kfree(smap); return ERR_PTR(ret); + } smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { + bpf_map_charge_finish(&smap->map.memory); kfree(smap); return ERR_PTR(-ENOMEM); } @@ -659,7 +662,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % BPF_SK_STORAGE_CACHE_SIZE; - smap->map.memory.pages = pages; return &smap->map; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4eb5b6a1b29f..1028c922a149 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) goto free_stab; } - stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.memory.pages); + err = bpf_map_charge_init(&stab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_stab; @@ -60,6 +60,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (stab->sks) return &stab->map; err = -ENOMEM; + bpf_map_charge_finish(&stab->map.memory); free_stab: kfree(stab); return ERR_PTR(err); -- cgit v1.2.3 From c85d69135a9175c50a823d04d62d932312d037b3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:59 -0700 Subject: bpf: move memory size checks to bpf_map_charge_init()

Most bpf map types do similar checks and the same bytes-to-pages conversion during memory allocation and charging. Let's unify these checks by moving them into bpf_map_charge_init().
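For orientation, the unified helper as it appears in the syscall.c hunk of this patch (condensed): callers now pass a size in bytes, and the overflow check plus the bytes-to-pages conversion happen once inside the helper instead of being repeated in every .alloc() callback.

  int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
  {
          u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;

          if (size >= U32_MAX - PAGE_SIZE)
                  return -E2BIG;

          /* ... then charge 'pages' against the current user's
           * RLIMIT_MEMLOCK, exactly as before ... */
  }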
Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/arraymap.c | 8 +------- kernel/bpf/cpumap.c | 5 +---- kernel/bpf/devmap.c | 5 +---- kernel/bpf/hashtab.c | 7 +------ kernel/bpf/local_storage.c | 5 +---- kernel/bpf/lpm_trie.c | 7 +------ kernel/bpf/queue_stack_maps.c | 4 ---- kernel/bpf/reuseport_array.c | 10 ++-------- kernel/bpf/stackmap.c | 8 +------- kernel/bpf/syscall.c | 9 +++++++-- kernel/bpf/xskmap.c | 5 +---- net/core/bpf_sk_storage.c | 4 +--- net/core/sock_map.c | 8 +------- 14 files changed, 20 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3c8f24f402bf..e5a309e6a400 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -652,7 +652,7 @@ void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages); +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3552da4407d9..0349cbf23cdb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -117,14 +117,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* make sure there is no u32 overflow later in round_up() */ cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - if (percpu) { + if (percpu) cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - } - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_charge_init(&mem, cost); if (ret < 0) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c633c8d68023..b31a71909307 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -106,12 +106,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_cmap; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_charge_init(&cmap->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + ret = bpf_map_charge_init(&cmap->map.memory, cost); if (ret) { err = ret; goto free_cmap; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 371bd880ed58..5ae7cce5ef16 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -108,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += dev_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_dtab; /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&dtab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) goto free_dtab; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b0bdc7b040ad..d92e05d9979b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -360,13 +360,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) 
else cost += (u64) htab->elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - /* make sure page count doesn't overflow */ - goto free_htab; - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&htab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&htab->map.memory, cost); if (err) goto free_htab; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e49bfd4f4f6d..addd6fdceec8 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -273,7 +273,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; struct bpf_map_memory mem; - u32 pages; int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) @@ -293,9 +292,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> - PAGE_SHIFT; - ret = bpf_map_charge_init(&mem, pages); + ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); if (ret < 0) return ERR_PTR(ret); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 6345a8d2dcd0..09334f13a8a0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -573,13 +573,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; cost += (u64) attr->max_entries * cost_per_node; - if (cost >= U32_MAX - PAGE_SIZE) { - ret = -E2BIG; - goto out_err; - } - ret = bpf_map_charge_init(&trie->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + ret = bpf_map_charge_init(&trie->map.memory, cost); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 224cb0fd8f03..f697647ceb54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -73,10 +73,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) size = (u64) attr->max_entries + 1; cost = queue_size = sizeof(*qs) + size * attr->value_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_charge_init(&mem, cost); if (ret < 0) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 5c6e25b1b9b1..50c083ba978c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -152,7 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; struct bpf_map_memory mem; - u64 cost, array_size; + u64 array_size; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -160,13 +160,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* make sure there is no u32 overflow later in round_up() */ - cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, array_size); if (err) return ERR_PTR(err); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8da24ca65d97..3d86072d8e32 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -117,14 +117,8 @@ static struct bpf_map 
*stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - - err = bpf_map_charge_init(&mem, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&mem, cost); if (err) return ERR_PTR(err); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4a5ebad99154..4c53cbd3329d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -205,11 +205,16 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages) +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) { - struct user_struct *user = get_current_user(); + u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; + struct user_struct *user; int ret; + if (size >= U32_MAX - PAGE_SIZE) + return -E2BIG; + + user = get_current_user(); ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index a329dab7c7a4..22066c28ba61 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -37,12 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_m; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_charge_init(&m->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&m->map.memory, cost); if (err) goto free_m; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 621a0b07ff11..f40e3d35fd9c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -626,7 +626,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) struct bpf_sk_storage_map *smap; unsigned int i; u32 nbuckets; - u32 pages; u64 cost; int ret; @@ -638,9 +637,8 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); nbuckets = 1U << smap->bucket_log; cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_charge_init(&smap->map.memory, pages); + ret = bpf_map_charge_init(&smap->map.memory, cost); if (ret < 0) { kfree(smap); return ERR_PTR(ret); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1028c922a149..52d4faeee18b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* Make sure page count doesn't overflow. */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_stab; - } - - err = bpf_map_charge_init(&stab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&stab->map.memory, cost); if (err) goto free_stab; -- cgit v1.2.3 From 6685699e4ef5e9903d5c8bc6c2e6e13b931c98e1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 4 Jun 2019 09:21:46 +0100 Subject: bpf: remove redundant assignment to err The variable err is assigned with the value -EINVAL that is never read and it is re-assigned a new value later on. 
The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 2 +- kernel/bpf/xskmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 5ae7cce5ef16..b58a33ca8a27 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -88,8 +88,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; - int err = -EINVAL; u64 cost; + int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 22066c28ba61..413d75f4fc72 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -17,8 +17,8 @@ struct xsk_map { static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - int cpu, err = -EINVAL; struct xsk_map *m; + int cpu, err; u64 cost; if (!capable(CAP_NET_ADMIN)) -- cgit v1.2.3 From fada7fdc83c0bf8755956bff707c42b609223301 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 6 Jun 2019 13:59:40 -0700 Subject: bpf: Allow bpf_map_lookup_elem() on an xskmap Currently, the AF_XDP code uses a separate map in order to determine if an xsk is bound to a queue. Instead of doing this, have bpf_map_lookup_elem() return a xdp_sock. Rearrange some xdp_sock members to eliminate structure holes. Remove selftest - will be added back in later patch. Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +++++ include/net/xdp_sock.h | 4 +-- include/uapi/linux/bpf.h | 4 +++ kernel/bpf/verifier.c | 26 ++++++++++++-- kernel/bpf/xskmap.c | 7 ++++ net/core/filter.c | 40 ++++++++++++++++++++++ .../selftests/bpf/verifier/prevent_map_lookup.c | 15 -------- 7 files changed, 85 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5a309e6a400..1fe137afa898 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,6 +280,7 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ + PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ }; /* The information passed from prog-specific *_is_valid_access @@ -727,6 +728,13 @@ void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d074b6d60f8a..ae0f368a62bb 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -58,11 +58,11 @@ struct xdp_sock { struct xdp_umem *umem; struct list_head flush_node; u16 queue_id; - struct xsk_queue *tx ____cacheline_aligned_in_smp; - struct list_head list; bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + struct xsk_queue *tx 
____cacheline_aligned_in_smp; + struct list_head list; /* Mutual exclusion of NAPI TX thread and sendmsg error paths * in the SKB destructor callback. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c6aef253173..ae0907d8c03a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3083,6 +3083,10 @@ struct bpf_sock_tuple { }; }; +struct bpf_xdp_sock { + __u32 queue_id; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c2cb5bd84ce..8d1786357a09 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -334,7 +334,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK; + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) @@ -406,6 +407,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", }; static char slot_type_char[] = { @@ -1363,6 +1365,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return true; default: return false; @@ -1843,6 +1846,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_TCP_SOCK: valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_XDP_SOCK: + valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -2007,6 +2013,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_TCP_SOCK: pointer_desc = "tcp_sock "; break; + case PTR_TO_XDP_SOCK: + pointer_desc = "xdp_sock "; + break; default: break; } @@ -2905,10 +2914,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, * appear. 
*/ case BPF_MAP_TYPE_CPUMAP: - case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; + case BPF_MAP_TYPE_XSKMAP: + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -3799,6 +3812,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -5038,6 +5052,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; + } else if (reg->map_ptr->map_type == + BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; } else { reg->type = PTR_TO_MAP_VALUE; } @@ -6299,6 +6316,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6693,6 +6711,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return false; default: return true; @@ -7826,6 +7845,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_TCP_SOCK: convert_ctx_access = bpf_tcp_sock_convert_ctx_access; break; + case PTR_TO_XDP_SOCK: + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; + break; default: continue; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 413d75f4fc72..ef7338cebd18 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -151,6 +151,12 @@ void __xsk_map_flush(struct bpf_map *map) } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } @@ -218,6 +224,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/net/core/filter.c b/net/core/filter.c index f2777dc0b624..a5e4ac7fcbe5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5680,6 +5680,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) return INET_ECN_set_ce(skb); } +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + default: + return size == sizeof(__u32); + } +} + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_XDP_SOCK_GET(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \ + *insn++ = 
BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct xdp_sock, FIELD)); \ + } while (0) + + switch (si->off) { + case offsetof(struct bpf_xdp_sock, queue_id): + BPF_XDP_SOCK_GET(queue_id); + break; + } + + return insn - insn_buf; +} + static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c index bbdba990fefb..da7a4b37cb98 100644 --- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c +++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c @@ -28,21 +28,6 @@ .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem", .prog_type = BPF_PROG_TYPE_SOCK_OPS, }, -{ - "prevent map lookup in xskmap", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_xskmap = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 17 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_XDP, -}, { "prevent map lookup in stack trace", .insns = { -- cgit v1.2.3 From aee450cbe482a8c2f6fa5b05b178ef8b8ff107ca Mon Sep 17 00:00:00 2001 From: Valdis Klētnieks Date: Thu, 6 Jun 2019 22:39:27 -0400 Subject: bpf: silence warning messages in core Compiling kernel/bpf/core.c with W=1 causes a flood of warnings: kernel/bpf/core.c:1198:65: warning: initialized field overwritten [-Woverride-init] 1198 | #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true | ^~~~ kernel/bpf/core.c:1087:2: note: in expansion of macro 'BPF_INSN_3_TBL' 1087 | INSN_3(ALU, ADD, X), \ | ^~~~~~ kernel/bpf/core.c:1202:3: note: in expansion of macro 'BPF_INSN_MAP' 1202 | BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), | ^~~~~~~~~~~~ kernel/bpf/core.c:1198:65: note: (near initialization for 'public_insntable[12]') 1198 | #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true | ^~~~ kernel/bpf/core.c:1087:2: note: in expansion of macro 'BPF_INSN_3_TBL' 1087 | INSN_3(ALU, ADD, X), \ | ^~~~~~ kernel/bpf/core.c:1202:3: note: in expansion of macro 'BPF_INSN_MAP' 1202 | BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), | ^~~~~~~~~~~~ 98 copies of the above. The attached patch silences the warnings, because we *know* we're overwriting the default initializer. That leaves bpf/core.c with only 6 other warnings, which become more visible in comparison. 
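For reference, a minimal standalone sketch (not taken from core.c; the names are made up) of the designated-initializer pattern that trips -Woverride-init. The opcode tables built via BPF_INSN_MAP() overwrite their range-initialized defaults in exactly this way, so the warning is noise here:

#include <stdbool.h>

/* Compile with gcc -Wall -Wextra to reproduce the warning. */
static const bool opcode_valid[256] = {
	[0 ... 255] = false,	/* default every slot first (GNU range extension) */
	[0x07] = true,		/* warning: initialized field overwritten */
	[0x0f] = true,		/* intentional override, hence cc-disable-warning */
};

Since the override is deliberate, suppressing the warning for this one object file is cheaper than restructuring the macro tables.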
Signed-off-by: Valdis Kletnieks
Acked-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4c2fa3ac56f6..29d781061cd5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-y := core.o
+CFLAGS_core.o += $(call cc-disable-warning, override-init)
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
--
cgit v1.2.3


From f7cf25b2026dc8441e0fa3a202c2aa8a56211e30 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Sat, 15 Jun 2019 12:12:17 -0700
Subject: bpf: track spill/fill of constants

Compilers often spill induction variables into the stack, hence it is
necessary for the verifier to track scalar values of the registers
through stack slots. Also a few bpf programs were incorrectly rejected
in the past, since the verifier was not able to track such constants
while they were used to compute offsets into packet headers.

Tracking constants through the stack significantly decreases the chances
of state pruning, since two different constants are considered to be
different by state equivalency. The end result is that cilium tests
suffer serious degradation in the number of states processed and in the
corresponding verification time:

                     before   after
bpf_lb-DLB_L3.o        1838    6441
bpf_lb-DLB_L4.o        3218    5908
bpf_lb-DUNKNOWN.o      1064    1064
bpf_lxc-DDROP_ALL.o   26935   93790
bpf_lxc-DUNKNOWN.o    34439  123886
bpf_netdev.o           9721   31413
bpf_overlay.o          6184   18561
bpf_lxc_jit.o         39389  359445

After further debugging it turned out that cilium progs are getting hurt
by clang due to the same constant tracking issue. Newer clang generates
better code by spilling less to the stack. Instead it keeps more
constants in the registers, which hurts state pruning, since the
verifier already tracks constants in the registers:

                  old clang   new clang
                     (no spill/fill tracking introduced by this patch)
bpf_lb-DLB_L3.o        1838        1923
bpf_lb-DLB_L4.o        3218        3077
bpf_lb-DUNKNOWN.o      1064        1062
bpf_lxc-DDROP_ALL.o   26935      166729
bpf_lxc-DUNKNOWN.o    34439      174607
bpf_netdev.o           9721        8407
bpf_overlay.o          6184        5420
bpf_lxc_jit.o         39389       39389

The final table is depressing:

                  old clang   old clang   new clang   new clang
                      const  spill/fill       const  spill/fill
bpf_lb-DLB_L3.o        1838        6441        1923        8128
bpf_lb-DLB_L4.o        3218        5908        3077        6707
bpf_lb-DUNKNOWN.o      1064        1064        1062        1062
bpf_lxc-DDROP_ALL.o   26935       93790      166729      380712
bpf_lxc-DUNKNOWN.o    34439      123886      174607      440652
bpf_netdev.o           9721       31413        8407       31904
bpf_overlay.o          6184       18561        5420       23569
bpf_lxc_jit.o         39389      359445       39389      359445

Tracking constants in the registers hurts state pruning already.
Adding tracking of constants through the stack hurts pruning even more.
A later patch addresses this general constant tracking issue with
coarse/precise logic.
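As a hypothetical illustration of the program shape being fixed (the function below and its use of ETH_HLEN are assumptions for the sketch, not taken from the patch): 'off' is a compile-time constant, and when clang spills it to a stack slot and fills it back before the pointer arithmetic, the verifier must remember that the slot still holds the constant for the bounds check to be provable:

#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/types.h>

/* Sketch only: without spill/fill tracking the fill of 'off' produces an
 * unknown scalar and the packet access below is rejected.
 */
static int parse_ip_proto(void *data, void *data_end)
{
	__u32 off = ETH_HLEN;		/* constant 14; may be spilled to a stack slot */
	struct iphdr *iph = data + off;	/* offset into the packet header */

	if ((void *)(iph + 1) > data_end)
		return 0;
	return iph->protocol;
}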
Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 90 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d1786357a09..6c13d86569a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1378,6 +1378,23 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } +static bool register_is_const(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); +} + +static void save_register_state(struct bpf_func_state *state, + int spi, struct bpf_reg_state *reg) +{ + int i; + + state->stack[spi].spilled_ptr = *reg; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_SPILL; +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -1387,7 +1404,7 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - enum bpf_reg_type type; + struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), state->acquired_refs, true); @@ -1404,27 +1421,37 @@ static int check_stack_write(struct bpf_verifier_env *env, } cur = env->cur_state->frame[env->cur_state->curframe]; - if (value_regno >= 0 && - is_spillable_regtype((type = cur->regs[value_regno].type))) { + if (value_regno >= 0) + reg = &cur->regs[value_regno]; + if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + !register_is_null(reg) && env->allow_ptr_leaks) { + save_register_state(state, spi, reg); + } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { + verbose_linfo(env, insn_idx, "; "); verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && type == PTR_TO_STACK) { + if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - /* save register state */ - state->stack[spi].spilled_ptr = cur->regs[value_regno]; - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + if (!env->allow_ptr_leaks) { + bool sanitize = false; - for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack[spi].slot_type[i] == STACK_MISC && - !env->allow_ptr_leaks) { + if (state->stack[spi].slot_type[0] == STACK_SPILL && + register_is_const(&state->stack[spi].spilled_ptr)) + sanitize = true; + for (i = 0; i < BPF_REG_SIZE; i++) + if (state->stack[spi].slot_type[i] == STACK_MISC) { + sanitize = true; + break; + } + if (sanitize) { int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; int soff = (-spi - 1) * BPF_REG_SIZE; @@ -1447,8 +1474,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } *poff = soff; } - state->stack[spi].slot_type[i] = STACK_SPILL; } + save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -1471,8 +1498,7 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (value_regno >= 0 && - register_is_null(&cur->regs[value_regno])) + 
if (reg && register_is_null(reg)) type = STACK_ZERO; /* Mark slots affected by this stack write. */ @@ -1490,6 +1516,7 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + struct bpf_reg_state *reg; u8 *stype; if (reg_state->allocated_stack <= slot) { @@ -1498,11 +1525,21 @@ static int check_stack_read(struct bpf_verifier_env *env, return -EACCES; } stype = reg_state->stack[spi].slot_type; + reg = ®_state->stack[spi].spilled_ptr; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose(env, "invalid size of register spill\n"); - return -EACCES; + if (reg->type != SCALAR_VALUE) { + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "invalid size of register fill\n"); + return -EACCES; + } + if (value_regno >= 0) { + mark_reg_unknown(env, state->regs, value_regno); + state->regs[value_regno].live |= REG_LIVE_WRITTEN; + } + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + return 0; } for (i = 1; i < BPF_REG_SIZE; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { @@ -1513,17 +1550,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + state->regs[value_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); - return 0; + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { int zeros = 0; @@ -1538,9 +1572,7 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1553,8 +1585,8 @@ static int check_stack_read(struct bpf_verifier_env *env, } state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - return 0; } + return 0; } static int check_stack_access(struct bpf_verifier_env *env, @@ -2415,7 +2447,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, slot, spi; + int err, min_off, max_off, i, j, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2503,6 +2535,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + __mark_reg_unknown(&state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + goto mark; + } + err: if (tnum_is_const(reg->var_off)) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", -- cgit v1.2.3 From fb8d251ee2a6bf4d7f4af5548e9c8f4fb5f90402 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:19 -0700 Subject: bpf: extend is_branch_taken to 
 registers

This patch extends the is_branch_taken() logic from JMP+K instructions
to JMP+X instructions. Conditional branches are often done when src and
dst registers contain known scalars. In such a case the verifier can
follow the branch that is going to be taken when the program executes.
That speeds up the verification and is an essential feature to support
bounded loops.

Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c13d86569a6..8d3a4ef1d969 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5266,9 +5266,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	struct bpf_verifier_state *this_branch = env->cur_state;
 	struct bpf_verifier_state *other_branch;
 	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
-	struct bpf_reg_state *dst_reg, *other_branch_regs;
+	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
 	u8 opcode = BPF_OP(insn->code);
 	bool is_jmp32;
+	int pred = -1;
 	int err;
 
 	/* Only conditional jumps are expected to reach here. */
@@ -5293,6 +5294,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 				insn->src_reg);
 			return -EACCES;
 		}
+		src_reg = &regs[insn->src_reg];
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
 			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -5308,20 +5310,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	dst_reg = &regs[insn->dst_reg];
 	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
 
-	if (BPF_SRC(insn->code) == BPF_K) {
-		int pred = is_branch_taken(dst_reg, insn->imm, opcode,
-					   is_jmp32);
-
-		if (pred == 1) {
-			/* only follow the goto, ignore fall-through */
-			*insn_idx += insn->off;
-			return 0;
-		} else if (pred == 0) {
-			/* only follow fall-through branch, since
-			 * that's where the program will go
-			 */
-			return 0;
-		}
+	if (BPF_SRC(insn->code) == BPF_K)
+		pred = is_branch_taken(dst_reg, insn->imm,
+				       opcode, is_jmp32);
+	else if (src_reg->type == SCALAR_VALUE &&
+		 tnum_is_const(src_reg->var_off))
+		pred = is_branch_taken(dst_reg, src_reg->var_off.value,
+				       opcode, is_jmp32);
+	if (pred == 1) {
+		/* only follow the goto, ignore fall-through */
+		*insn_idx += insn->off;
+		return 0;
+	} else if (pred == 0) {
+		/* only follow fall-through branch, since
+		 * that's where the program will go
+		 */
+		return 0;
 	}
 
 	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
--
cgit v1.2.3


From 2589726d12a1b12eaaa93c7f1ea64287e383c7a5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Sat, 15 Jun 2019 12:12:20 -0700
Subject: bpf: introduce bounded loops

Allow the verifier to validate loops by simulating their execution.
Existing programs have used '#pragma unroll' to make the compiler unroll
their loops. Instead let the verifier simulate all iterations of the
loop. In order to do that, introduce a parentage chain of
bpf_verifier_state and a 'branches' counter for the number of branches
left to explore. See the more detailed algorithm description in
bpf_verifier.h.

This algorithm borrows the key idea from Edward Cree's approach:
https://patchwork.ozlabs.org/patch/877222/
Additional state pruning heuristics make such a brute-force loop walk
practical even for large loops.
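As a minimal sketch of what this enables (illustrative only, not from the patch), a bounded loop that previously had to be unrolled at compile time can now be walked by the verifier one iteration at a time:

#include <linux/types.h>

/* Sketch only: at most 16 iterations, each access bounds-checked, so the
 * verifier can simulate every iteration and see that the loop terminates.
 */
static __u64 sum_first_words(const __u64 *data, const __u64 *data_end)
{
	__u64 sum = 0;
	int i;

	for (i = 0; i < 16; i++) {	/* used to require '#pragma unroll' */
		if (data + i + 1 > data_end)
			break;
		sum += data[i];
	}
	return sum;
}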
Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 51 ++++++++++++++- kernel/bpf/verifier.c | 143 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 181 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 704ed7971472..03037373b447 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -194,6 +194,53 @@ struct bpf_func_state { struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; + struct bpf_verifier_state *parent; + /* + * 'branches' field is the number of branches left to explore: + * 0 - all possible paths from this state reached bpf_exit or + * were safely pruned + * 1 - at least one path is being explored. + * This state hasn't reached bpf_exit + * 2 - at least two paths are being explored. + * This state is an immediate parent of two children. + * One is fallthrough branch with branches==1 and another + * state is pushed into stack (to be explored later) also with + * branches==1. The parent of this state has branches==1. + * The verifier state tree connected via 'parent' pointer looks like: + * 1 + * 1 + * 2 -> 1 (first 'if' pushed into stack) + * 1 + * 2 -> 1 (second 'if' pushed into stack) + * 1 + * 1 + * 1 bpf_exit. + * + * Once do_check() reaches bpf_exit, it calls update_branch_counts() + * and the verifier state tree will look: + * 1 + * 1 + * 2 -> 1 (first 'if' pushed into stack) + * 1 + * 1 -> 1 (second 'if' pushed into stack) + * 0 + * 0 + * 0 bpf_exit. + * After pop_stack() the do_check() will resume at second 'if'. + * + * If is_state_visited() sees a state with branches > 0 it means + * there is a loop. If such state is exactly equal to the current state + * it's an infinite loop. Note states_equal() checks for states + * equvalency, so two states being 'states_equal' does not mean + * infinite loop. The exact comparison is provided by + * states_maybe_looping() function. It's a stronger pre-check and + * much faster than states_equal(). + * + * This algorithm may not find all possible infinite loops or + * loop iteration count may be too high. + * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. 
+ */ + u32 branches; u32 insn_idx; u32 curframe; u32 active_spin_lock; @@ -312,7 +359,9 @@ struct bpf_verifier_env { } cfg; u32 subprog_cnt; /* number of instructions analyzed by the verifier */ - u32 insn_processed; + u32 prev_insn_processed, insn_processed; + /* number of jmps, calls, exits analyzed so far */ + u32 prev_jmps_processed, jmps_processed; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d3a4ef1d969..25baa3c8cdd2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -721,6 +721,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; dst_state->active_spin_lock = src->active_spin_lock; + dst_state->branches = src->branches; + dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -736,6 +738,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + while (st) { + u32 br = --st->branches; + + /* WARN_ON(br > 1) technically makes sense here, + * but see comment in push_stack(), hence: + */ + WARN_ONCE((int)br < 0, + "BUG update_branch_counts:branches_to_explore=%d\n", + br); + if (br) + break; + st = st->parent; + } +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -789,6 +808,18 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size); goto err; } + if (elem->st.parent) { + ++elem->st.parent->branches; + /* WARN_ON(branches > 2) technically makes sense here, + * but + * 1. speculative states will bump 'branches' for non-branch + * instructions + * 2. is_state_visited() heuristics may decide not to create + * a new state for a sequence of branches and all such current + * and cloned states will be pointing to a single parent state + * which might have large 'branches' count. 
+ */ + } return &elem->st; err: free_verifier_state(env->cur_state, true); @@ -5682,7 +5713,8 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx) * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, + bool loop_ok) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -5712,6 +5744,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + if (loop_ok && env->allow_ptr_leaks) + return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -5763,7 +5797,7 @@ peek_stack: if (opcode == BPF_EXIT) { goto mark_explored; } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5772,7 +5806,8 @@ peek_stack: init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5785,7 +5820,7 @@ peek_stack: } /* unconditional jump with single edge */ ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env); + FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5798,13 +5833,13 @@ peek_stack: } else { /* conditional jump with two edges */ init_explored_state(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5814,7 +5849,7 @@ peek_stack: /* all other non-branch instructions with single * fall-through edge */ - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -6247,6 +6282,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = *explored_state(env, insn); while (sl) { + if (sl->state.branches) + goto next; if (sl->state.insn_idx != insn || sl->state.curframe != cur->curframe) goto next; @@ -6611,12 +6648,32 @@ static int propagate_liveness(struct bpf_verifier_env *env, return 0; } +static bool states_maybe_looping(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr = cur->curframe; + + if (old->curframe != fr) + return false; + + fold = old->frame[fr]; + fcur = cur->frame[fr]; + for (i = 0; i < MAX_BPF_REG; i++) + if (memcmp(&fold->regs[i], &fcur->regs[i], + offsetof(struct bpf_reg_state, parent))) + return false; + return true; +} + + static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; + bool add_new_state = false; if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not @@ -6624,6 +6681,18 @@ static int is_state_visited(struct bpf_verifier_env *env, 
int insn_idx) */ return 0; + /* bpf progs typically have pruning point every 4 instructions + * http://vger.kernel.org/bpfconf2019.html#session-1 + * Do not add new state for future pruning if the verifier hasn't seen + * at least 2 jumps and at least 8 instructions. + * This heuristics helps decrease 'total_states' and 'peak_states' metric. + * In tests that amounts to up to 50% reduction into total verifier + * memory consumption and 20% verifier time speedup. + */ + if (env->jmps_processed - env->prev_jmps_processed >= 2 && + env->insn_processed - env->prev_insn_processed >= 8) + add_new_state = true; + pprev = explored_state(env, insn_idx); sl = *pprev; @@ -6633,6 +6702,30 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; if (sl->state.insn_idx != insn_idx) goto next; + if (sl->state.branches) { + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur)) { + verbose_linfo(env, insn_idx, "; "); + verbose(env, "infinite loop detected at insn %d\n", insn_idx); + return -EINVAL; + } + /* if the verifier is processing a loop, avoid adding new state + * too often, since different loop iterations have distinct + * states and may not help future pruning. + * This threshold shouldn't be too low to make sure that + * a loop with large bound will be rejected quickly. + * The most abusive loop will be: + * r1 += 1 + * if r1 < 1000000 goto pc-2 + * 1M insn_procssed limit / 100 == 10k peak states. + * This threshold shouldn't be too high either, since states + * at the end of the loop are likely to be useful in pruning. + */ + if (env->jmps_processed - env->prev_jmps_processed < 20 && + env->insn_processed - env->prev_insn_processed < 100) + add_new_state = false; + goto miss; + } if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -6650,7 +6743,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl->miss_cnt++; +miss: + /* when new state is not going to be added do not increase miss count. + * Otherwise several loop iterations will remove the state + * recorded earlier. The goal of these heuristics is to have + * states from some iterations of the loop (some in the beginning + * and some at the end) to help pruning. + */ + if (add_new_state) + sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, @@ -6662,6 +6763,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ *pprev = sl->next; if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + u32 br = sl->state.branches; + + WARN_ONCE(br, + "BUG live_done but branches_to_explore %d\n", + br); free_verifier_state(&sl->state, false); kfree(sl); env->peak_states--; @@ -6687,18 +6793,25 @@ next: if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; - /* there were no equivalent states, remember current one. - * technically the current state is not proven to be safe yet, + if (!add_new_state) + return 0; + + /* There were no equivalent states, remember the current one. + * Technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. Since there are no loops, we won't be + * or it will be rejected. 
When there are no loops the verifier won't be * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit + * again on the way to bpf_exit. + * When looping the sl->state.branches will be > 0 and this state + * will not be considered for equivalence until branches == 0. */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; env->total_states++; env->peak_states++; + env->prev_jmps_processed = env->jmps_processed; + env->prev_insn_processed = env->insn_processed; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6709,6 +6822,9 @@ next: return err; } new->insn_idx = insn_idx; + WARN_ONCE(new->branches != 1, + "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + cur->parent = new; new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -6795,6 +6911,7 @@ static int do_check(struct bpf_verifier_env *env) return -ENOMEM; state->curframe = 0; state->speculative = false; + state->branches = 1; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -7001,6 +7118,7 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); + env->jmps_processed++; if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || @@ -7086,6 +7204,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: + update_branch_counts(env, env->cur_state); err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); if (err < 0) { -- cgit v1.2.3 From eea1c227b9e9bad295e8ef984004a9acf12bb68c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:21 -0700 Subject: bpf: fix callees pruning callers The commit 7640ead93924 partially resolved the issue of callees incorrectly pruning the callers. With introduction of bounded loops and jmps_processed heuristic single verifier state may contain multiple branches and calls. It's possible that new verifier state (for future pruning) will be allocated inside callee. Then callee will exit (still within the same verifier state). It will go back to the caller and there R6-R9 registers will be read and will trigger mark_reg_read. But the reg->live for all frames but the top frame is not set to LIVE_NONE. Hence mark_reg_read will fail to propagate liveness into parent and future walking will incorrectly conclude that the states are equivalent because LIVE_READ is not set. In other words the rule for parent/live should be: whenever register parentage chain is set the reg->live should be set to LIVE_NONE. is_state_visited logic already follows this rule for spilled registers. Fixes: 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25baa3c8cdd2..870c8f19ce80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6834,17 +6834,18 @@ next: * the state of the call instruction (with WRITTEN set), and r0 comes * from callee with its full parentage chain, anyway. 
 */
-	for (j = 0; j <= cur->curframe; j++)
-		for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
-			cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
 	 * their parent and current state never has children yet.  Only
 	 * explored_states can get read marks.)
 	 */
-	for (i = 0; i < BPF_REG_FP; i++)
-		cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
+	for (j = 0; j <= cur->curframe; j++) {
+		for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
+			cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
+		for (i = 0; i < BPF_REG_FP; i++)
+			cur->frame[j]->regs[i].live = REG_LIVE_NONE;
+	}
 
 	/* all stack frames are accessible from callee, clear them all */
 	for (j = 0; j <= cur->curframe; j++) {
--
cgit v1.2.3


From b5dc0163d8fd78e64a7e21f309cf932fda34353e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Sat, 15 Jun 2019 12:12:25 -0700
Subject: bpf: precise scalar_value tracking

Introduce precision tracking logic that helps cilium programs the most:

                     old clang   old clang   new clang   new clang
                              with all patches        with all patches
bpf_lb-DLB_L3.o        1838        2283        1923        1863
bpf_lb-DLB_L4.o        3218        2657        3077        2468
bpf_lb-DUNKNOWN.o      1064         545        1062         544
bpf_lxc-DDROP_ALL.o   26935       23045      166729       22629
bpf_lxc-DUNKNOWN.o    34439       35240      174607       28805
bpf_netdev.o           9721        8753        8407        6801
bpf_overlay.o          6184        7901        5420        4754
bpf_lxc_jit.o         39389       50925       39389       50925

Consider code:

654: (85) call bpf_get_hash_recalc#34
655: (bf) r7 = r0
656: (15) if r8 == 0x0 goto pc+29
657: (bf) r2 = r10
658: (07) r2 += -48
659: (18) r1 = 0xffff8881e41e1b00
661: (85) call bpf_map_lookup_elem#1
662: (15) if r0 == 0x0 goto pc+23
663: (69) r1 = *(u16 *)(r0 +0)
664: (15) if r1 == 0x0 goto pc+21
665: (bf) r8 = r7
666: (57) r8 &= 65535
667: (bf) r2 = r8
668: (3f) r2 /= r1
669: (2f) r2 *= r1
670: (bf) r1 = r8
671: (1f) r1 -= r2
672: (57) r1 &= 255
673: (25) if r1 > 0x1e goto pc+12
 R0=map_value(id=0,off=0,ks=20,vs=64,imm=0) R1_w=inv(id=0,umax_value=30,var_off=(0x0; 0x1f))
674: (67) r1 <<= 1
675: (0f) r0 += r1

At this point the verifier will notice that scalar R1 is used in a map
pointer adjustment. R1 has to be precise for later operations on R0 to
be validated properly.

The verifier will backtrack the above code in the following way:
last_idx 675 first_idx 664
regs=2 stack=0 before 675: (0f) r0 += r1          // started backtracking R1; regs=2 is a bitmask
regs=2 stack=0 before 674: (67) r1 <<= 1
regs=2 stack=0 before 673: (25) if r1 > 0x1e goto pc+12
regs=2 stack=0 before 672: (57) r1 &= 255
regs=2 stack=0 before 671: (1f) r1 -= r2          // now both R1 and R2 have to be precise -> regs=6 mask
regs=6 stack=0 before 670: (bf) r1 = r8           // after this insn R8 and R2 have to be precise
regs=104 stack=0 before 669: (2f) r2 *= r1        // after this one R8, R2, and R1
regs=106 stack=0 before 668: (3f) r2 /= r1
regs=106 stack=0 before 667: (bf) r2 = r8
regs=102 stack=0 before 666: (57) r8 &= 65535
regs=102 stack=0 before 665: (bf) r8 = r7
regs=82 stack=0 before 664: (15) if r1 == 0x0 goto pc+21
// this is the end of the verifier state. The following regs will be marked precise:
 R1_rw=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R7_rw=invP(id=0)
parent didn't have regs=82 stack=0 marks          // so backtracking continues into the parent state
last_idx 663 first_idx 655
regs=82 stack=0 before 663: (69) r1 = *(u16 *)(r0 +0)        // R1 was assigned, no need to track it further
regs=80 stack=0 before 662: (15) if r0 == 0x0 goto pc+23     // keep tracking R7
regs=80 stack=0 before 661: (85) call bpf_map_lookup_elem#1  // keep tracking R7
regs=80 stack=0 before 659: (18) r1 = 0xffff8881e41e1b00
regs=80 stack=0 before 658: (07) r2 += -48
regs=80 stack=0 before 657: (bf) r2 = r10
regs=80 stack=0 before 656: (15) if r8 == 0x0 goto pc+29
regs=80 stack=0 before 655: (bf) r7 = r0          // here the assignment into R7
// mark R0 to be precise:
 R0_rw=invP(id=0)
parent didn't have regs=1 stack=0 marks           // regs=1 -> tracking R0
last_idx 654 first_idx 644
regs=1 stack=0 before 654: (85) call bpf_get_hash_recalc#34
// and in the parent frame it was a return value
// nothing further to backtrack

Two scalar registers not marked precise are equivalent from the state
pruning point of view. More details in the patch comments.

It doesn't support bpf2bpf calls yet and is enabled for root only.

Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_verifier.h |  18 ++
 kernel/bpf/verifier.c        | 491 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 498 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 03037373b447..19393b0964a8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -139,6 +139,8 @@ struct bpf_reg_state {
 	 */
 	s32 subreg_def;
 	enum bpf_reg_liveness live;
+	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
+	bool precise;
 };
 
 enum bpf_stack_slot_type {
@@ -190,6 +192,11 @@ struct bpf_func_state {
 	struct bpf_stack_state *stack;
 };
 
+struct bpf_idx_pair {
+	u32 prev_idx;
+	u32 idx;
+};
+
 #define MAX_CALL_FRAMES 8
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
@@ -245,6 +252,17 @@ struct bpf_verifier_state {
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
+
+	/* first and last insn idx of this verifier state */
+	u32 first_insn_idx;
+	u32 last_insn_idx;
+	/* jmp history recorded from first to last.
+	 * backtracking is using it to go from last to first.
+	 * For most states jmp_history_cnt is [0-3].
+	 * For loops can go up to ~40.
+ */ + struct bpf_idx_pair *jmp_history; + u32 jmp_history_cnt; }; #define bpf_get_spilled_reg(slot, frame) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 870c8f19ce80..709ce4cef8ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -455,12 +455,12 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); - if (t == PTR_TO_STACK) - verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -522,11 +522,17 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, "=%s", - reg_type_str[state->stack[i].spilled_ptr.type]); - else + if (state->stack[i].slot_type[0] == STACK_SPILL) { + reg = &state->stack[i].spilled_ptr; + t = reg->type; + verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) + verbose(env, "%lld", reg->var_off.value + reg->off); + } else { verbose(env, "=%s", types_buf); + } } if (state->acquired_refs && state->refs[0].id) { verbose(env, " refs=%d", state->refs[0].id); @@ -675,6 +681,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -684,6 +697,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } + clear_jmp_history(state); if (free_self) kfree(state); } @@ -711,8 +725,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src) { struct bpf_func_state *dst; + u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; int i, err; + if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { + kfree(dst_state->jmp_history); + dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; + } + memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); @@ -723,6 +747,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->active_spin_lock = src->active_spin_lock; dst_state->branches = src->branches; dst_state->parent = src->parent; + dst_state->first_insn_idx = src->first_insn_idx; + dst_state->last_insn_idx = src->last_insn_idx; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -967,6 +993,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; + + /* constant backtracking is enabled for root only for now */ + reg->precise = capable(CAP_SYS_ADMIN) ? 
false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1378,6 +1407,389 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +/* for any branch, call, exit record the history of jmps in the given state */ +static int push_jmp_history(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur) +{ + u32 cnt = cur->jmp_history_cnt; + struct bpf_idx_pair *p; + + cnt++; + p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); + if (!p) + return -ENOMEM; + p[cnt - 1].idx = env->insn_idx; + p[cnt - 1].prev_idx = env->prev_insn_idx; + cur->jmp_history = p; + cur->jmp_history_cnt = cnt; + return 0; +} + +/* Backtrack one insn at a time. If idx is not at the top of recorded + * history then previous instruction came from straight line execution. + */ +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) +{ + u32 cnt = *history; + + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; + } else { + i--; + } + return i; +} + +/* For given verifier state backtrack_insn() is called from the last insn to + * the first insn. Its purpose is to compute a bitmask of registers and + * stack slots that needs precision in the parent verifier state. + */ +static int backtrack_insn(struct bpf_verifier_env *env, int idx, + u32 *reg_mask, u64 *stack_mask) +{ + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + .private_data = env, + }; + struct bpf_insn *insn = env->prog->insnsi + idx; + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u32 dreg = 1u << insn->dst_reg; + u32 sreg = 1u << insn->src_reg; + u32 spi; + + if (insn->code == 0) + return 0; + if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); + verbose(env, "%d: ", idx); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + if (!(*reg_mask & dreg)) + return 0; + if (opcode == BPF_MOV) { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg = sreg + * dreg needs precision after this insn + * sreg needs precision before this insn + */ + *reg_mask &= ~dreg; + *reg_mask |= sreg; + } else { + /* dreg = K + * dreg needs precision after this insn. + * Corresponding register is already marked + * as precise=true in this verifier state. + * No further markings in parent are necessary + */ + *reg_mask &= ~dreg; + } + } else { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg += sreg + * both dreg and sreg need precision + * before this insn + */ + *reg_mask |= sreg; + } /* else dreg += K + * dreg still needs precision before this insn + */ + } + } else if (class == BPF_LDX) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + + /* scalars can only be spilled into stack w/o losing precision. + * Load from any other memory can be zero extended. + * The desire to keep that precision is already indicated + * by 'precise' mark in corresponding register of this state. + * No further tracking necessary. + */ + if (insn->src_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + + /* dreg = *(u64 *)[fp - off] was a fill from the stack. 
+ * that [fp - off] slot contains scalar that needs to be + * tracked with precision + */ + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + *stack_mask |= 1ull << spi; + } else if (class == BPF_STX) { + if (*reg_mask & dreg) + /* stx shouldn't be using _scalar_ dst_reg + * to access memory. It means backtracking + * encountered a case of pointer subtraction. + */ + return -ENOTSUPP; + /* scalars can only be spilled into stack */ + if (insn->dst_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + if (!(*stack_mask & (1ull << spi))) + return 0; + *stack_mask &= ~(1ull << spi); + *reg_mask |= sreg; + } else if (class == BPF_JMP || class == BPF_JMP32) { + if (opcode == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + return -ENOTSUPP; + /* regular helper call sets R0 */ + *reg_mask &= ~1; + if (*reg_mask & 0x3f) { + /* if backtracing was looking for registers R1-R5 + * they should have been found already. + */ + verbose(env, "BUG regs %x\n", *reg_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } else if (opcode == BPF_EXIT) { + return -ENOTSUPP; + } + } else if (class == BPF_LD) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + /* It's ld_imm64 or ld_abs or ld_ind. + * For ld_imm64 no further tracking of precision + * into parent is necessary + */ + if (mode == BPF_IND || mode == BPF_ABS) + /* to be analyzed */ + return -ENOTSUPP; + } else if (class == BPF_ST) { + if (*reg_mask & dreg) + /* likely pointer subtraction */ + return -ENOTSUPP; + } + return 0; +} + +/* the scalar precision tracking algorithm: + * . at the start all registers have precise=false. + * . scalar ranges are tracked as normal through alu and jmp insns. + * . once precise value of the scalar register is used in: + * . ptr + scalar alu + * . if (scalar cond K|scalar) + * . helper_call(.., scalar, ...) where ARG_CONST is expected + * backtrack through the verifier states and mark all registers and + * stack slots with spilled constants that these scalar regisers + * should be precise. + * . during state pruning two registers (or spilled stack slots) + * are equivalent if both are not precise. + * + * Note the verifier cannot simply walk register parentage chain, + * since many different registers and stack slots could have been + * used to compute single precise scalar. + * + * The approach of starting with precise=true for all registers and then + * backtrack to mark a register as not precise when the verifier detects + * that program doesn't care about specific value (e.g., when helper + * takes register as ARG_ANYTHING parameter) is not safe. + * + * It's ok to walk single parentage chain of the verifier states. + * It's possible that this backtracking will go all the way till 1st insn. + * All other branches will be explored for needing precision later. + * + * The backtracking needs to deal with cases like: + * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) + * r9 -= r8 + * r5 = r9 + * if r5 > 0x79f goto pc+7 + * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) + * r5 += 1 + * ... 
+ * call bpf_perf_event_output#25 + * where .arg5_type = ARG_CONST_SIZE_OR_ZERO + * + * and this case: + * r6 = 1 + * call foo // uses callee's r6 inside to compute r0 + * r0 += r6 + * if r0 == 0 goto + * + * to track above reg_mask/stack_mask needs to be independent for each frame. + * + * Also if parent's curframe > frame where backtracking started, + * the verifier need to mark registers in both frames, otherwise callees + * may incorrectly prune callers. This is similar to + * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") + * + * For now backtracking falls back into conservative marking. + */ +static void mark_all_scalars_precise(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + /* big hammer: mark all scalars precise in this path. + * pop_stack may still get !precise scalars. + */ + for (; st; st = st->parent) + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (func->stack[j].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + } +} + +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + struct bpf_verifier_state *st = env->cur_state; + int first_idx = st->first_insn_idx; + int last_idx = env->insn_idx; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + u32 reg_mask = 1u << regno; + u64 stack_mask = 0; + bool skip_first = true; + int i, err; + + if (!env->allow_ptr_leaks) + /* backtracking is root only for now */ + return 0; + + func = st->frame[st->curframe]; + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (reg->precise) + return 0; + func->regs[regno].precise = true; + + for (;;) { + DECLARE_BITMAP(mask, 64); + bool new_marks = false; + u32 history = st->jmp_history_cnt; + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + for (i = last_idx;;) { + if (skip_first) { + err = 0; + skip_first = false; + } else { + err = backtrack_insn(env, i, ®_mask, &stack_mask); + } + if (err == -ENOTSUPP) { + mark_all_scalars_precise(env, st); + return 0; + } else if (err) { + return err; + } + if (!reg_mask && !stack_mask) + /* Found assignment(s) into tracked register in this state. + * Since this state is already marked, just return. + * Nothing to be tracked further in the parent state. + */ + return 0; + if (i == first_idx) + break; + i = get_prev_insn_idx(st, i, &history); + if (i >= env->prog->len) { + /* This can happen if backtracking reached insn 0 + * and there are still reg_mask or stack_mask + * to backtrack. + * It means the backtracking missed the spot where + * particular register was initialized with a constant. 
+ */ + verbose(env, "BUG backtracking idx %d\n", i); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } + st = st->parent; + if (!st) + break; + + func = st->frame[st->curframe]; + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) { + /* This can happen if backtracking + * is propagating stack precision where + * caller has larger stack frame + * than callee, but backtrack_insn() should + * have returned -ENOTSUPP. + */ + verbose(env, "BUG spi %d stack_size %d\n", + i, func->allocated_stack); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + + if (func->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[i].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + if (env->log.level & BPF_LOG_LEVEL) { + print_verifier_state(env, func); + verbose(env, "parent %s regs=%x stack=%llx marks\n", + new_marks ? "didn't have" : "already had", + reg_mask, stack_mask); + } + + if (!new_marks) + break; + + last_idx = st->last_insn_idx; + first_idx = st->first_insn_idx; + } + return 0; +} + + static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -1435,6 +1847,7 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), @@ -1457,6 +1870,17 @@ static int check_stack_write(struct bpf_verifier_env *env, if (reg && size == BPF_REG_SIZE && register_is_const(reg) && !register_is_null(reg) && env->allow_ptr_leaks) { + if (dst_reg != BPF_REG_FP) { + /* The backtracking logic can only recognize explicit + * stack slot address like [fp - 8]. Other spill of + * scalar via different register has to be conervative. + * Backtrack from here and mark all registers as precise + * that contributed into 'reg' being a constant. + */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } save_register_state(state, spi, reg); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ @@ -1529,8 +1953,13 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (reg && register_is_null(reg)) + if (reg && register_is_null(reg)) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; type = STACK_ZERO; + } /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) @@ -1610,6 +2039,17 @@ static int check_stack_read(struct bpf_verifier_env *env, * so the whole register == const_zero */ __mark_reg_const_zero(&state->regs[value_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. 
This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[value_regno].precise = true; } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, value_regno); @@ -2925,6 +3365,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -4361,6 +4803,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); + int err; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -4387,11 +4830,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * This is legal, but we have to reverse our * src/dest handling in computing the range */ + err = mark_chain_precision(env, insn->dst_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); } @@ -5348,6 +5797,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, tnum_is_const(src_reg->var_off)) pred = is_branch_taken(dst_reg, src_reg->var_off.value, opcode, is_jmp32); + if (pred >= 0) { + err = mark_chain_precision(env, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X && !err) + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; + } if (pred == 1) { /* only follow the goto, ignore fall-through */ *insn_idx += insn->off; @@ -5825,6 +6281,11 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). + */ + init_explored_state(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ @@ -6325,6 +6786,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, switch (rold->type) { case SCALAR_VALUE: if (rcur->type == SCALAR_VALUE) { + if (!rold->precise && !rcur->precise) + return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); @@ -6675,6 +7138,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) int i, j, err, states_cnt = 0; bool add_new_state = false; + cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here @@ -6791,10 +7255,10 @@ next: env->max_states_per_insn = states_cnt; if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return 0; + return push_jmp_history(env, cur); if (!add_new_state) - return 0; + return push_jmp_history(env, cur); /* There were no equivalent states, remember the current one. 
* Technically the current state is not proven to be safe yet, @@ -6824,7 +7288,10 @@ next: new->insn_idx = insn_idx; WARN_ONCE(new->branches != 1, "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + cur->parent = new; + cur->first_insn_idx = insn_idx; + clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -6904,6 +7371,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; + int prev_insn_idx = -1; env->prev_linfo = NULL; @@ -6929,6 +7397,7 @@ static int do_check(struct bpf_verifier_env *env) u8 class; int err; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); @@ -7001,6 +7470,7 @@ static int do_check(struct bpf_verifier_env *env) regs = cur_regs(env); env->insn_aux_data[env->insn_idx].seen = true; + prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -7174,7 +7644,6 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - env->prev_insn_idx = env->insn_idx; err = prepare_func_exit(env, &env->insn_idx); if (err) return err; @@ -7206,7 +7675,7 @@ static int do_check(struct bpf_verifier_env *env) return err; process_bpf_exit: update_branch_counts(env, env->cur_state); - err = pop_stack(env, &env->prev_insn_idx, + err = pop_stack(env, &prev_insn_idx, &env->insn_idx); if (err < 0) { if (err != -ENOENT) -- cgit v1.2.3 From 6bf071bf09d4b2ff3ee8783531e2ce814f0870cb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 18 Jun 2019 15:05:27 +0200 Subject: xdp: page_pool related fix to cpumap When converting an xdp_frame into an SKB and sending it into the network stack, the underlying XDP memory model needs to release the associated resources, because the network stack has no callbacks for XDP memory models. The only memory model that needs this is page_pool, when a driver uses the DMA-mapping feature. Introduce page_pool_release_page(), which basically does the same as page_pool_unmap_page(). Add xdp_release_frame() as the XDP memory model interface for calling it, if the memory model matches MEM_TYPE_PAGE_POOL, to save the function call overhead for others. Have cpumap call xdp_release_frame() before xdp_scrub_frame(). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 15 ++++++++++++++- include/net/xdp.h | 15 +++++++++++++++ kernel/bpf/cpumap.c | 3 +++ net/core/xdp.c | 15 +++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ad218cef88c5..e240fac4c5b9 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -110,7 +110,6 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) struct page_pool *page_pool_create(const struct page_pool_params *params); void page_pool_destroy(struct page_pool *pool); -void page_pool_unmap_page(struct page_pool *pool, struct page *page); /* Never call this directly, use helpers below */ void __page_pool_put_page(struct page_pool *pool, @@ -133,6 +132,20 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +/* Disconnects a page (from a page_pool).
API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_unmap_page(struct page_pool *pool, struct page *page); +static inline void page_pool_release_page(struct page_pool *pool, + struct page *page) +{ +#ifdef CONFIG_PAGE_POOL + page_pool_unmap_page(pool, page); +#endif +} + static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { return page->dma_addr; diff --git a/include/net/xdp.h b/include/net/xdp.h index 8e0deddef35c..40c6d3398458 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -129,6 +129,21 @@ void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); +/* When sending xdp_frame into the network stack, then there is no + * return point callback, which is needed to release e.g. DMA-mapping + * resources with page_pool. Thus, have explicit function to release + * frame resources. + */ +void __xdp_release_frame(void *data, struct xdp_mem_info *mem); +static inline void xdp_release_frame(struct xdp_frame *xdpf) +{ + struct xdp_mem_info *mem = &xdpf->mem; + + /* Curr only page_pool needs this */ + if (mem->type == MEM_TYPE_PAGE_POOL) + __xdp_release_frame(xdpf->data, mem); +} + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8ee5532cf6a6..8dff08768087 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -208,6 +208,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); diff --git a/net/core/xdp.c b/net/core/xdp.c index 1d5f2292962c..0fcc32340c4e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -381,6 +381,21 @@ void xdp_return_buff(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_return_buff); +/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ +void __xdp_release_frame(void *data, struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_release_page(xa->page_pool, page); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(__xdp_release_frame); + int xdp_attachment_query(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { -- cgit v1.2.3 From e4f07120210a1794c1f1ae64d209a2fbc7bd2682 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 19 Jun 2019 12:01:05 -0700 Subject: bpf: fix NULL deref in btf_type_is_resolve_source_only Commit 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") added invocations of btf_type_is_resolve_source_only before btf_type_nosize_or_null, which checks for the NULL pointer. Swap the order of btf_type_nosize_or_null and btf_type_is_resolve_source_only to make sure we do the NULL pointer check first.
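For context, the NULL-ordering hazard in miniature; the helper bodies below are simplified from the kernel sources for illustration and are not the exact definitions:

static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
	/* Dereferences t unconditionally -- faults if t is NULL */
	return btf_type_is_var(t) || btf_type_is_datasec(t);
}

static bool btf_type_nosize_or_null(const struct btf_type *t)
{
	/* Tolerates a NULL type, so it must run first */
	return !t || btf_type_nosize(t);
}

Since btf_type_by_id() can return NULL for an invalid type id, any check that dereferences the type before the NULL test can fault, which is what syzbot triggered.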
Fixes: 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") Reported-by: syzbot Signed-off-by: Stanislav Fomichev Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cad09858a5f2..546ebee39e2a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_resolve_source_only(index_type) || - btf_type_nosize_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type) || + btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_resolve_source_only(elem_type) || - btf_type_nosize_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type) || + btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_resolve_source_only(member_type) || - btf_type_nosize_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type) || + btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; -- cgit v1.2.3 From e7d4798960b3ebcd243ae6a59e04d4fe6518c96c Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 13 Jun 2019 18:39:58 +0900 Subject: xdp: Add tracepoint for bulk XDP_TX This is introduced for admins to check what is happening on XDP_TX when bulk XDP_TX is in use, which will first be introduced in veth in the next commit. v3: - Add act field to be in line with other XDP tracepoints.
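As a usage illustration, a hedged sketch of how a driver's bulk-flush path might emit the new tracepoint; struct example_bq and example_xmit_bulk() are invented for this sketch, and only trace_xdp_bulk_tx() comes from the patch:

/* Hypothetical driver bulk-flush routine firing the new tracepoint. */
static void example_xdp_flush_bq(struct net_device *dev, struct example_bq *bq)
{
	int sent, err = 0;

	sent = example_xmit_bulk(dev, bq->q, bq->count);	/* assumed helper */
	if (sent < 0) {
		err = sent;
		sent = 0;
	}

	trace_xdp_bulk_tx(dev, sent, bq->count - sent, err);
	bq->count = 0;
}

Note that TP_fast_assign() pins act to XDP_TX, so the tracepoint always prints action=TX, in line with the other XDP tracepoints.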
Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/trace/events/xdp.h | 29 +++++++++++++++++++++++++++++ kernel/bpf/core.c | 1 + 2 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index bb5e380e2ef3..81e708c4b513 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -50,6 +50,35 @@ TRACE_EVENT(xdp_exception, __entry->ifindex) ); +TRACE_EVENT(xdp_bulk_tx, + + TP_PROTO(const struct net_device *dev, + int sent, int drops, int err), + + TP_ARGS(dev, sent, drops, err), + + TP_STRUCT__entry( + __field(int, ifindex) + __field(u32, act) + __field(int, drops) + __field(int, sent) + __field(int, err) + ), + + TP_fast_assign( + __entry->ifindex = dev->ifindex; + __entry->act = XDP_TX; + __entry->drops = drops; + __entry->sent = sent; + __entry->err = err; + ), + + TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", + __entry->ifindex, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->sent, __entry->drops, __entry->err) +); + DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad3be85f1411..561ed07d3007 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2101,3 +2101,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); -- cgit v1.2.3 From 9db1ff0a415c7de8eb67df5b2c56ac409ccefc37 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Jun 2019 17:35:03 -0700 Subject: bpf: fix compiler warning with CONFIG_MODULES=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With CONFIG_MODULES=n, the following compiler warning occurs: /data/users/yhs/work/net-next/kernel/trace/bpf_trace.c:605:13: warning: ‘do_bpf_send_signal’ defined but not used [-Wunused-function] static void do_bpf_send_signal(struct irq_work *entry) The __init function send_signal_irq_work_init(), which calls do_bpf_send_signal(), is defined under CONFIG_MODULES. Hence, when CONFIG_MODULES=n, nothing calls the static function do_bpf_send_signal(), hence the warning. The init function send_signal_irq_work_init() should work without CONFIG_MODULES. Moving it out of the CONFIG_MODULES code section fixes the compiler warning, and also makes the bpf_send_signal() helper work without CONFIG_MODULES.
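The warning pattern is easy to reproduce in isolation; a minimal sketch with invented names, assuming a kernel-style build:

/* Minimal reproduction of the -Wunused-function pattern being fixed. */
static void helper(void)		/* defined unconditionally */
{
}

#ifdef CONFIG_FOO
static int __init foo_init(void)
{
	helper();			/* the only caller is under the ifdef */
	return 0;
}
subsys_initcall(foo_init);
#endif

/* With CONFIG_FOO=n, helper() is defined but never referenced, so gcc
 * warns "'helper' defined but not used [-Wunused-function]". Moving the
 * caller out of the #ifdef, as the patch below does, removes the warning
 * without wrapping the helper itself in preprocessor conditionals.
 */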
Fixes: 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") Reported-By: Arnd Bergmann Signed-off-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c102c240bb0b..ca1255d14576 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1431,6 +1431,20 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + +subsys_initcall(send_signal_irq_work_init); + #ifdef CONFIG_MODULES static int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) @@ -1478,18 +1492,5 @@ static int __init bpf_event_init(void) return 0; } -static int __init send_signal_irq_work_init(void) -{ - int cpu; - struct send_signal_irq_work *work; - - for_each_possible_cpu(cpu) { - work = per_cpu_ptr(&send_signal_work, cpu); - init_irq_work(&work->irq_work, do_bpf_send_signal); - } - return 0; -} - fs_initcall(bpf_event_init); -subsys_initcall(send_signal_irq_work_init); #endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 75672dda27bd00109a84cd975c17949ad9c45663 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 25 Jun 2019 17:41:50 +0100 Subject: bpf: fix BPF_ALU32 | BPF_ARSH on BE arches Yauheni reported that the following code does not work correctly on BE arches: ALU_ARSH_X: DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); CONT; ALU_ARSH_K: DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); CONT; and is causing the test_verifier test 'arsh32 on imm 2' to fail on BE arches. The code takes the address of DST and reinterprets the memory directly, so it is not endianness neutral. We should instead perform standard C type casting on the variable. A u64 to s32 conversion will drop the high 32 bits and preserve the low 32 bits as a signed integer, which is all we want. Fixes: 2dc6b100f928 ("bpf: interpreter support BPF_ALU | BPF_ARSH") Reported-by: Yauheni Kaliuta Reviewed-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Jiong Wang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 080e2bb644cc..f2148db91439 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1364,10 +1364,10 @@ select_insn: insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> SRC); CONT; ALU_ARSH_K: - DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; -- cgit v1.2.3 From e5c891a349d7c556b7b9dc231d6dd78e88a29e5c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 25 Jun 2019 14:38:58 -0700 Subject: bpf: fix cgroup bpf release synchronization Since commit 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself"), cgroup_bpf release occurs asynchronously (from a worker context), and before the release of the cgroup itself. This introduced a previously non-existing race between the release and update paths. E.g. if a leaf's cgroup_bpf is released and a new bpf program is attached to one of its ancestor cgroups at the same time.
The race may result in double-free and other memory corruptions. To fix the problem, let's protect the body of cgroup_bpf_release() with cgroup_mutex, as it effectively was previously, when all this code was called from the cgroup release path with the cgroup mutex held. Also, on the update path, let's skip cgroups that have no chance of invoking a bpf program. If the cgroup bpf refcnt has reached 0, it means that the cgroup is offline (no attached processes), and there are no associated sockets left. It means there is no point in updating the effective progs array! And it can lead to a leak, if it happens after the release. So, let's skip such cgroups. Big thanks to Tejun Heo for discovering and debugging this problem! Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") Reported-by: Tejun Heo Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c225c42e114a..077ed3a19848 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -16,6 +16,8 @@ #include #include +#include "../cgroup/cgroup-internal.h" + DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); @@ -38,6 +40,8 @@ static void cgroup_bpf_release(struct work_struct *work) struct bpf_prog_array *old_array; unsigned int type; + mutex_lock(&cgroup_mutex); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog_list *pl, *tmp; @@ -54,10 +58,12 @@ static void cgroup_bpf_release(struct work_struct *work) } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], - percpu_ref_is_dying(&cgrp->bpf.refcnt)); + lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } + mutex_unlock(&cgroup_mutex); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -229,6 +235,9 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + err = compute_effective_progs(desc, type, &desc->bpf.inactive); if (err) goto cleanup; @@ -238,6 +247,14 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) { + if (unlikely(desc->bpf.inactive)) { + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + continue; + } + activate_effective_progs(desc, type, desc->bpf.inactive); desc->bpf.inactive = NULL; } -- cgit v1.2.3 From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Jun 2019 13:38:47 -0700 Subject: bpf: implement getsockopt and setsockopt hooks Implement a new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass the kernel completely. BPF_CGROUP_GETSOCKOPT can inspect/modify getsockopt arguments that the kernel returns. Both hooks reuse the existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf).
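As an aside, a hedged sketch of what a program for the new hook could look like from the BPF side, in the style of the kernel selftests; the section name follows libbpf conventions, and the header paths and constants are illustrative assumptions:

/* Minimal cgroup setsockopt filter: deny changing IP_TOS, allow the rest.
 * Returning 0 makes the user's setsockopt() fail with EPERM; returning 1
 * lets the kernel handler run as usual.
 */
#include <linux/bpf.h>
#include <netinet/in.h>
#include "bpf_helpers.h"	/* assumed selftests-style helper header */

SEC("cgroup/setsockopt")
int deny_ip_tos(struct bpf_sockopt *ctx)
{
	if (ctx->level == IPPROTO_IP && ctx->optname == IP_TOS)
		return 0;	/* reject */
	return 1;		/* continue */
}

char _license[] SEC("license") = "GPL";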
This might be slow to do for each {s,g}etsockopt call, that's why I've added __cgroup_bpf_prog_array_is_empty that exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsocktop and user adding/removing bpf progs from a cgroup. The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko Cc: Martin Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 45 ++++++ include/linux/bpf.h | 2 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 14 ++ kernel/bpf/cgroup.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 9 ++ kernel/bpf/syscall.c | 19 +++ kernel/bpf/verifier.c | 8 ++ net/core/filter.c | 2 +- net/socket.c | 30 ++++ 11 files changed, 472 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index bd79ae32909a..169fd25f6bc2 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, loff_t *ppos, void **new_buf, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval); +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int 
retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ + optname, optval, \ + optlen, \ + kernel_optval); \ + __ret; \ +}) + +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + get_user(__ret, optlen); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ + max_optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ + optname, optval, \ + optlen, max_optlen, \ + retval); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ + optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a62e7889b0b6..18f4cc2c6acd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -518,6 +518,7 @@ struct bpf_prog_array { struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); +bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); @@ -1051,6 +1052,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; +extern const struct bpf_func_proto bpf_tcp_sock_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5a9975678d6f..eec5aeeeaf92 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 43b45d6db36d..340f7d648974 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1199,4 +1199,14 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +struct bpf_sockopt_kern { + struct sock *sk; + u8 *optval; + u8 *optval_end; + s32 level; + s32 optname; + s32 optlen; + s32 retval; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b077507efa3f..a396b516a2b2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,6 +170,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, }; enum bpf_attach_type { @@ -194,6 +195,8 @@ enum bpf_attach_type { BPF_CGROUP_SYSCTL, BPF_CGROUP_UDP4_RECVMSG, BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, __MAX_BPF_ATTACH_TYPE }; @@ -3541,4 +3544,15 @@ struct bpf_sysctl { */ }; +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 077ed3a19848..76fa0076f20d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "../cgroup/cgroup-internal.h" @@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, + enum bpf_attach_type attach_type) +{ + struct bpf_prog_array *prog_array; + bool empty; + + rcu_read_lock(); + prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); + empty = bpf_prog_array_is_empty(prog_array); + rcu_read_unlock(); + + return empty; +} + +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +{ + if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + return -EINVAL; + + ctx->optval = kzalloc(max_optlen, GFP_USER); + if (!ctx->optval) + return -ENOMEM; + + ctx->optval_end = ctx->optval + max_optlen; + ctx->optlen = max_optlen; + + return 0; +} + +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +{ + kfree(ctx->optval); +} + +int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = *level, + .optname = *optname, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. 
+ */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + return 0; + + ret = sockopt_alloc_buf(&ctx, *optlen); + if (ret) + return ret; + + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + ret = -EFAULT; + goto out; + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen == -1) { + /* optlen set to -1, bypass kernel */ + ret = 1; + } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + /* optlen is out of bounds */ + ret = -EFAULT; + } else { + /* optlen within bounds, run kernel handler */ + ret = 0; + + /* export any potential modifications */ + *level = ctx.level; + *optname = ctx.optname; + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } + +out: + if (ret) + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); + +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + return retval; + + ret = sockopt_alloc_buf(&ctx, max_optlen); + if (ret) + return ret; + + if (!retval) { + /* If kernel getsockopt finished successfully, + * copy whatever was returned to the user back + * into our temporary buffer. Set optlen to the + * one that kernel returned as well to let + * BPF programs inspect the value. + */ + + if (get_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + if (ctx.optlen > max_optlen) + ctx.optlen = max_optlen; + + if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + ret = -EFAULT; + goto out; + } + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen > max_optlen) { + ret = -EFAULT; + goto out; + } + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. 
+ */ + if (ctx.retval != 0 && ctx.retval != retval) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + ret = ctx.retval; + +out: + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); + static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) { @@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; + +static const struct bpf_func_proto * +cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool cg_sockopt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sockopt)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_GETSOCKOPT; + case offsetof(struct bpf_sockopt, optname): + /* fallthrough */ + case offsetof(struct bpf_sockopt, level): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_SETSOCKOPT; + case offsetof(struct bpf_sockopt, optlen): + return size == size_default; + default: + return false; + } + } + + switch (off) { + case offsetof(struct bpf_sockopt, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case offsetof(struct bpf_sockopt, optval): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sockopt, optval_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; + default: + if (size != size_default) + return false; + break; + } + return true; +} + +#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ + T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sockopt, sk): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + break; + case offsetof(struct bpf_sockopt, level): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + break; + case offsetof(struct bpf_sockopt, optname): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + break; + case offsetof(struct bpf_sockopt, optlen): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + else + *insn++ = 
CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + break; + case offsetof(struct bpf_sockopt, retval): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + break; + case offsetof(struct bpf_sockopt, optval): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + break; + case offsetof(struct bpf_sockopt, optval_end): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + break; + } + + return insn - insn_buf; +} + +static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, + bool direct_write, + const struct bpf_prog *prog) +{ + /* Nothing to do for sockopt argument. The data is kzalloc'ated. + */ + return 0; +} + +const struct bpf_verifier_ops cg_sockopt_verifier_ops = { + .get_func_proto = cg_sockopt_func_proto, + .is_valid_access = cg_sockopt_is_valid_access, + .convert_ctx_access = cg_sockopt_convert_ctx_access, + .gen_prologue = cg_sockopt_get_prologue, +}; + +const struct bpf_prog_ops cg_sockopt_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 561ed07d3007..e2c1b43728da 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array) return cnt; } +bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +{ + struct bpf_prog_array_item *item; + + for (item = array->items; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) + return false; + return true; +} static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7713cf39795a..b0f545e07425 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + switch (expected_attach_type) { + case BPF_CGROUP_SETSOCKOPT: + case BPF_CGROUP_GETSOCKOPT: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 
0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: return prog->enforce_expected_attach_type && @@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e079b2298f8..6b5623d320f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2215,6 +2215,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, env->seen_direct_write = true; return true; + + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + if (t == BPF_WRITE) + env->seen_direct_write = true; + + return true; + default: return false; } @@ -6066,6 +6073,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; default: return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 2014d76e0d2a..dc8534be12fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5651,7 +5651,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) return (unsigned long)NULL; } -static const struct bpf_func_proto bpf_tcp_sock_proto = { +const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, diff --git a/net/socket.c b/net/socket.c index 963df5dbdd54..0ddfbfb761d9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, + &optname, optval, &optlen, + &kernel_optval); + + if (err < 0) { + goto out_put; + } else if (err > 0) { + err = 0; + goto out_put; + } + + if (kernel_optval) { + set_fs(KERNEL_DS); + optval = (char __user __force *)kernel_optval; + } + if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); + + if (kernel_optval) { + set_fs(oldfs); + kfree(kernel_optval); + } out_put: fput_light(sock->file, fput_needed); } @@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; + int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; + max_optlen = 
BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); + + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, + max_optlen, err); out_put: fput_light(sock->file, fput_needed); } -- cgit v1.2.3 From c8af5cd75e2411d5a5aacf115f59a5ff6b87f3fa Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: xskmap: Move non-standard list manipulation to helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper in list.h for the non-standard way of clearing a list that is used in xskmap. This makes it easier to reuse it in the other map types, and also makes sure this usage is not forgotten in any list refactorings in the future. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- include/linux/list.h | 14 ++++++++++++++ kernel/bpf/xskmap.c | 3 +-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/list.h b/include/linux/list.h index e951228db4b2..85c92555e31f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) WRITE_ONCE(prev->next, next); } +/* + * Delete a list entry and clear the 'prev' pointer. + * + * This is a special-purpose list clearing method used in the networking code + * for lists allocated as per-cpu, where we don't want to incur the extra + * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this + * needs to check the node 'prev' pointer instead of calling list_empty(). + */ +static inline void __list_del_clearprev(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = NULL; +} + /** * list_del - deletes entry from list. * @entry: the element to delete from the list. diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index ef7338cebd18..9bb96ace9fa1 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map) list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); - __list_del(xs->flush_node.prev, xs->flush_node.next); - xs->flush_node.prev = NULL; + __list_del_clearprev(&xs->flush_node); } } -- cgit v1.2.3 From d5df2830ca9922d03a33940ea424c9a5f39f1162 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: devmap/cpumap: Use flush list instead of bitmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The socket map uses a linked list instead of a bitmap to keep track of which entries to flush. Do the same for devmap and cpumap, as this means we don't have to care about the map index when enqueueing things into the map (and so we can cache the map lookup). 
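Reduced to its essentials, the shared idiom looks as follows; a simplified sketch that drops the percpu and locking details of the real maps:

/* Flush-list pattern adopted from sockmap: flush_node.prev doubles as
 * the "already queued" flag and is cleared via __list_del_clearprev().
 */
struct bulk_queue {
	struct list_head flush_node;	/* prev == NULL means not queued */
	unsigned int count;
};

static void bq_enqueue_sketch(struct bulk_queue *bq, struct list_head *flush_list)
{
	bq->count++;
	if (!bq->flush_node.prev)	/* first frame since the last flush */
		list_add(&bq->flush_node, flush_list);
}

static void bq_flush_all_sketch(struct list_head *flush_list)
{
	struct bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		/* ... drain bq->q into the real destination here ... */
		bq->count = 0;
		__list_del_clearprev(&bq->flush_node);
	}
}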
Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 105 +++++++++++++++++++++++---------------------------- kernel/bpf/devmap.c | 107 +++++++++++++++++++++++----------------------------- net/core/filter.c | 2 - 3 files changed, 95 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8dff08768087..ef49e17ae47c 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -32,14 +32,19 @@ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on + * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); + /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), @@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); 
free_cmap: @@ -334,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -349,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -405,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -488,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -514,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map) synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * list be empty on _all_ cpus. Because the above synchronize_rcu() + * ensures the map is disconnected from the program we can assume no new + * items will be added to the list. 
*/ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -538,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happen after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -590,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -621,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del_clearprev(&bq->flush_node); + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -631,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver code invoking us to finish, due to driver @@ -646,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -665,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; - - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op.
- */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smp_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 40e86a7e0ef0..a4dddc867cbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -17,9 +17,8 @@ * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed - * until all bits are cleared indicating outstanding flush operations have - * completed. + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until + * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above, the xchg() operation also keeps the @@ -48,9 +47,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -65,23 +68,18 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err, cpu; u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -99,7 +97,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); @@ -108,28 +106,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: +
free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); free_dtab: - free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -158,14 +158,14 @@ static void dev_map_free(struct bpf_map *map) rcu_barrier(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * list to empty on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * from the program we can assume no new items will be added. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -181,7 +181,7 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -203,18 +203,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -241,6 +233,7 @@ out: trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del_clearprev(&bq->flush_node); return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -263,31 +256,18 @@ error: * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the ctx bitmap - * is zeroed before completing to ensure all flush operations have completed. + * net device can be torn down. On devmap tear down we ensure the flush list + * is empty before completing to ensure all flush operations have completed. */ void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. 
- */ - if (unlikely(!dev)) - continue; - - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - - __clear_bit(bit, bitmap); - } + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); rcu_read_unlock(); } @@ -314,10 +294,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -327,6 +308,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -377,17 +362,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; rcu_read_lock(); for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } rcu_read_unlock(); } @@ -434,8 +414,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; - u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + u32 i = *(u32 *)key; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -458,6 +440,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); diff --git a/net/core/filter.c b/net/core/filter.c index dc8534be12fc..1e5fd37e9ab5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3523,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = dev_map_enqueue(dst, xdp, dev_rx); if (unlikely(err)) return err; - __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { @@ -3532,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (unlikely(err)) return err; - __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: { -- cgit v1.2.3 From 0cdbb4b09a0658b72c563638d476113aadd91afb Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 28 Jun 2019 11:12:35 +0200 Subject: devmap: Allow map lookups from eBPF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't currently allow lookups into a devmap from eBPF, because the map lookup returns a pointer directly to the dev->ifindex, which shouldn't be modifiable from eBPF. However, being able to do lookups in devmaps is useful to know (e.g.) whether forwarding to a specific interface is enabled. Currently, programs work around this by keeping a shadow map of another type which indicates whether a map index is valid. 
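As a rough sketch, that shadow-map workaround looks something like the following XDP program (hypothetical map names, sizes and policy, assuming clang and libbpf's bpf_helpers.h; this is not code from the patch). User space must update the shadow array in lockstep with the devmap:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 64);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} tx_ports SEC(".maps");

/* Shadow of tx_ports: user space sets slot i to 1 whenever tx_ports[i]
 * holds a valid ifindex, since the devmap itself cannot be read from eBPF.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} tx_ports_shadow SEC(".maps");

SEC("xdp")
int xdp_fwd(struct xdp_md *ctx)
{
	__u32 key = 0; /* egress slot picked by some policy */
	__u32 *valid = bpf_map_lookup_elem(&tx_ports_shadow, &key);

	if (!valid || !*valid)
		return XDP_PASS;
	return bpf_redirect_map(&tx_ports, key, 0);
}

char _license[] SEC("license") = "GPL";

With the change below, the shadow map becomes unnecessary: the program can call bpf_map_lookup_elem() on tx_ports itself, and because the map is forced to BPF_F_RDONLY_PROG the verifier rejects writes through the returned pointer.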
Since we now have a flag to make maps read-only from the eBPF side, we can simply lift the lookup restriction if we make sure this flag is always set. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 5 +++++ kernel/bpf/verifier.c | 7 ++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a4dddc867cbf..d83cf8ccc872 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -89,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + /* Lookup returns a pointer straight to dev->ifindex, so make sure the + * verifier prevents writes from the BPF side + */ + attr->map_flags |= BPF_F_RDONLY_PROG; + dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b5623d320f9..9b6ee93d5a85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3414,12 +3414,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_get_local_storage) goto error; break; - /* devmap returns a pointer to a live net_device ifindex that we cannot - * allow to be modified from bpf side. So do not allow lookup elements - * for now. - */ case BPF_MAP_TYPE_DEVMAP: - if (func_id != BPF_FUNC_redirect_map) + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases -- cgit v1.2.3 From a3ce685dd01a786fa5bc388e47d0066a4f842591 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Jun 2019 09:24:09 -0700 Subject: bpf: fix precision tracking When equivalent state is found the current state needs to propagate precision marks. Otherwise the verifier will prune the search incorrectly. There is a price for correctness:

                     before    before    broken    fixed
                     cnst      spill     precise   precise
bpf_lb-DLB_L3.o        1923      8128      1863      1898
bpf_lb-DLB_L4.o        3077      6707      2468      2666
bpf_lb-DUNKNOWN.o      1062      1062       544       544
bpf_lxc-DDROP_ALL.o  166729    380712     22629     36823
bpf_lxc-DUNKNOWN.o   174607    440652     28805     45325
bpf_netdev.o           8407     31904      6801      7002
bpf_overlay.o          5420     23569      4754      4858
bpf_lxc_jit.o         39389    359445     50925     69631

Overall precision tracking is still very effective. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Reported-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Tested-by: Lawrence Brakmo Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 121 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b6ee93d5a85..a2e763703c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1659,16 +1659,18 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, } } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, + int spi) { struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; int last_idx = env->insn_idx; struct bpf_func_state *func; struct bpf_reg_state *reg; - u32 reg_mask = 1u << regno; - u64 stack_mask = 0; + u32 reg_mask = regno >= 0 ? 1u << regno : 0; + u64 stack_mask = spi >= 0 ?
1ull << spi : 0; bool skip_first = true; + bool new_marks = false; int i, err; if (!env->allow_ptr_leaks) @@ -1676,18 +1678,43 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; func = st->frame[st->curframe]; - reg = &func->regs[regno]; - if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); - return -EFAULT; + if (regno >= 0) { + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (!reg->precise) + new_marks = true; + else + reg_mask = 0; + reg->precise = true; } - if (reg->precise) - return 0; - func->regs[regno].precise = true; + while (spi >= 0) { + if (func->stack[spi].slot_type[0] != STACK_SPILL) { + stack_mask = 0; + break; + } + reg = &func->stack[spi].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask = 0; + break; + } + if (!reg->precise) + new_marks = true; + else + stack_mask = 0; + reg->precise = true; + break; + } + + if (!new_marks) + return 0; + if (!reg_mask && !stack_mask) + return 0; for (;;) { DECLARE_BITMAP(mask, 64); - bool new_marks = false; u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL) @@ -1730,12 +1757,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) if (!st) break; + new_marks = false; func = st->frame[st->curframe]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1756,11 +1786,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return -EFAULT; } - if (func->stack[i].slot_type[0] != STACK_SPILL) + if (func->stack[i].slot_type[0] != STACK_SPILL) { + stack_mask &= ~(1ull << i); continue; + } reg = &func->stack[i].spilled_ptr; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + stack_mask &= ~(1ull << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1772,6 +1806,8 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) reg_mask, stack_mask); } + if (!reg_mask && !stack_mask) + break; if (!new_marks) break; @@ -1781,6 +1817,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; } +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + return __mark_chain_precision(env, regno, -1); +} + +static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +{ + return __mark_chain_precision(env, -1, spi); +} static bool is_spillable_regtype(enum bpf_reg_type type) { @@ -7111,6 +7156,46 @@ static int propagate_liveness(struct bpf_verifier_env *env, return 0; } +/* find precise scalars in the previous equivalent state and + * propagate them into the current state + */ +static int propagate_precision(struct bpf_verifier_env *env, + const struct bpf_verifier_state *old) +{ + struct bpf_reg_state *state_reg; + struct bpf_func_state *state; + int i, err = 0; + + state = old->frame[old->curframe]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating r%d\n", i); + err = mark_chain_precision(env, i); + if (err < 0) + return err; + } + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + 
state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE); + err = mark_chain_precision_stack(env, i); + if (err < 0) + return err; + } + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -7203,6 +7288,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * this state and will pop a new one. */ err = propagate_liveness(env, &sl->state, cur); + + /* if previous state reached the exit with precision and + * current state is equivalent to it (except precision marks) + * the precision needs to be propagated back into + * the current state. + */ + err = err ? : push_jmp_history(env, cur); + err = err ? : propagate_precision(env, &sl->state); if (err) return err; return 1; -- cgit v1.2.3 From 6705fea0c799a4efb9a9ce2968a2f7a570e33dc2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 3 Jul 2019 16:26:30 +0800 Subject: bpf: cgroup: Fix build error without CONFIG_NET If CONFIG_NET is not set and CONFIG_CGROUP_BPF=y, building with gcc fails: kernel/bpf/cgroup.o: In function `cg_sockopt_func_proto': cgroup.c:(.text+0x237e): undefined reference to `bpf_sk_storage_get_proto' cgroup.c:(.text+0x2394): undefined reference to `bpf_sk_storage_delete_proto' kernel/bpf/cgroup.o: In function `__cgroup_bpf_run_filter_getsockopt': (.text+0x2a1f): undefined reference to `lock_sock_nested' (.text+0x2ca2): undefined reference to `release_sock' kernel/bpf/cgroup.o: In function `__cgroup_bpf_run_filter_setsockopt': (.text+0x3006): undefined reference to `lock_sock_nested' (.text+0x32bb): undefined reference to `release_sock' Reported-by: Hulk Robot Suggested-by: Stanislav Fomichev Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Signed-off-by: YueHaibing Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 76fa0076f20d..0a00eaca6fae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -939,6 +939,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +#ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, enum bpf_attach_type attach_type) { @@ -1120,6 +1121,7 @@ out: return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); +#endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) @@ -1386,10 +1388,12 @@ static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { +#ifdef CONFIG_NET case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; -- cgit v1.2.3
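The guard pattern in the fix above can be seen in a minimal userspace sketch (FEATURE_NET is a hypothetical stand-in for CONFIG_NET, and the function names are invented): every reference to a symbol that only exists when the feature is built must itself be compiled out, otherwise the link fails with undefined-reference errors like the ones quoted in the commit message.

#include <stdio.h>

#ifdef FEATURE_NET
/* Only exists when the feature is compiled in, analogous to
 * bpf_sk_storage_get_proto being defined only under CONFIG_NET.
 */
static const char *sk_storage_proto(void)
{
	return "sk_storage_get";
}
#endif

static const char *func_proto(int func_id)
{
	switch (func_id) {
#ifdef FEATURE_NET
	case 1:
		return sk_storage_proto(); /* guarded, so no dangling reference */
#endif
	default:
		return NULL; /* unknown func_id, or feature not built in */
	}
}

int main(void)
{
	const char *p = func_proto(1);

	printf("func 1 -> %s\n", p ? p : "(not available)");
	return 0;
}

Building with cc -DFEATURE_NET sketch.c resolves the call, while plain cc sketch.c still links cleanly because the reference is removed together with the symbol, which is what the added #ifdef CONFIG_NET blocks achieve in kernel/bpf/cgroup.c.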