From 59089a189e3adde4cf85f2ce479738d1ae4c514d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 29 Jun 2021 09:39:15 +0000 Subject: bpf: Remove superfluous aux sanitation on subprog rejection Follow-up to fe9a5ca7e370 ("bpf: Do not mark insn as seen under speculative path verification"). The sanitize_insn_aux_data() helper does not serve a particular purpose in today's code. The original intention for the helper was that if function-by-function verification fails, a given program would be cleared from temporary insn_aux_data[], and then its verification would be re-attempted in the context of the main program a second time. However, a failure in do_check_subprogs() will skip do_check_main() and propagate the error to the user instead, thus such situation can never occur. Given its interaction is not compatible to the Spectre v1 mitigation (due to comparing aux->seen with env->pass_cnt), just remove sanitize_insn_aux_data() to avoid future bugs in this area. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9de3c9c3267c..8a7a28b4cfb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12752,37 +12752,6 @@ static void free_states(struct bpf_verifier_env *env) } } -/* The verifier is using insn_aux_data[] to store temporary data during - * verification and to store information for passes that run after the - * verification like dead code sanitization. do_check_common() for subprogram N - * may analyze many other subprograms. sanitize_insn_aux_data() clears all - * temporary data after do_check_common() finds that subprogram N cannot be - * verified independently. pass_cnt counts the number of times - * do_check_common() was run and insn->aux->seen tells the pass number - * insn_aux_data was touched. These variables are compared to clear temporary - * data from failed pass. For testing and experiments do_check_common() can be - * run multiple times even when prior attempt to verify is unsuccessful. - * - * Note that special handling is needed on !env->bypass_spec_v1 if this is - * ever called outside of error path with subsequent program rejection. - */ -static void sanitize_insn_aux_data(struct bpf_verifier_env *env) -{ - struct bpf_insn *insn = env->prog->insnsi; - struct bpf_insn_aux_data *aux; - int i, class; - - for (i = 0; i < env->prog->len; i++) { - class = BPF_CLASS(insn[i].code); - if (class != BPF_LDX && class != BPF_STX) - continue; - aux = &env->insn_aux_data[i]; - if (aux->seen != env->pass_cnt) - continue; - memset(aux, 0, offsetof(typeof(*aux), orig_idx)); - } -} - static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); @@ -12859,9 +12828,6 @@ out: if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); - if (ret) - /* clean aux data in case subprog was rejected */ - sanitize_insn_aux_data(env); return ret; } -- cgit v1.2.3 From e042aa532c84d18ff13291d00620502ce7a38dda Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 16 Jul 2021 09:18:21 +0000 Subject: bpf: Fix pointer arithmetic mask tightening under state pruning In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we narrowed the offset mask for unprivileged pointer arithmetic in order to mitigate a corner case where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of- bounds in order to leak kernel memory via side-channel to user space. The verifier's state pruning for scalars leaves one corner case open where in the first verification path R_x holds an unknown scalar with an aux->alu_limit of e.g. 7, and in a second verification path that same register R_x, here denoted as R_x', holds an unknown scalar which has tighter bounds and would thus satisfy range_within(R_x, R_x') as well as tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3: Given the second path fits the register constraints for pruning, the final generated mask from aux->alu_limit will remain at 7. While technically not wrong for the non-speculative domain, it would however be possible to craft similar cases where the mask would be too wide as in 7fedb63a8307. One way to fix it is to detect the presence of unknown scalar map pointer arithmetic and force a deeper search on unknown scalars to ensure that we do not run into a masking mismatch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e774ecc1cd1f..7ba7e800d472 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -414,6 +414,7 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ + bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; bool allow_ptr_to_map_access; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a7a28b4cfb9..657062cb4d85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6561,6 +6561,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + /* Limit pruning on unknown scalars to enable deep search for + * potential masking differences from other program paths. + */ + if (!off_is_imm) + env->explore_alu_limits = true; } err = update_alu_sanitation_state(aux, alu_state, alu_limit); @@ -9936,8 +9942,8 @@ next: } /* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) +static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { bool equal; @@ -9963,6 +9969,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; switch (rold->type) { case SCALAR_VALUE: + if (env->explore_alu_limits) + return false; if (rcur->type == SCALAR_VALUE) { if (!rold->precise && !rcur->precise) return true; @@ -10053,9 +10061,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_func_state *old, - struct bpf_func_state *cur, - struct bpf_id_pair *idmap) +static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_id_pair *idmap) { int i, spi; @@ -10100,9 +10107,8 @@ static bool stacksafe(struct bpf_func_state *old, continue; if (old->stack[spi].slot_type[0] != STACK_SPILL) continue; - if (!regsafe(&old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, - idmap)) + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -10159,10 +10165,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + if (!regsafe(env, &old->regs[i], &cur->regs[i], + env->idmap_scratch)) return false; - if (!stacksafe(old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, env->idmap_scratch)) return false; if (!refsafe(old, cur)) -- cgit v1.2.3 From f5e81d1117501546b7be050c5fbafa6efd2c722c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 In case of JITs, each of the JIT backends compiles the BPF nospec instruction /either/ to a machine instruction which emits a speculation barrier /or/ to /no/ machine instruction in case the underlying architecture is not affected by Speculative Store Bypass or has different mitigations in place already. This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' instruction for mitigation. In case of arm64, we rely on the firmware mitigation as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, it works for all of the kernel code with no need to provide any additional instructions here (hence only comment in arm64 JIT). Other archs can follow as needed. The BPF nospec instruction is specifically targeting Spectre v4 since i) we don't use a serialization barrier for the Spectre v1 case, and ii) mitigation instructions for v1 and v4 might be different on some archs. The BPF nospec is required for a future commit, where the BPF verifier does annotate intermediate BPF programs with speculation barriers. Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- arch/arm/net/bpf_jit_32.c | 3 +++ arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ arch/mips/net/ebpf_jit.c | 3 +++ arch/powerpc/net/bpf_jit_comp32.c | 6 ++++++ arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ arch/riscv/net/bpf_jit_comp32.c | 4 ++++ arch/riscv/net/bpf_jit_comp64.c | 4 ++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/sparc/net/bpf_jit_comp_64.c | 3 +++ arch/x86/net/bpf_jit_comp.c | 7 +++++++ arch/x86/net/bpf_jit_comp32.c | 6 ++++++ include/linux/filter.h | 15 +++++++++++++++ kernel/bpf/core.c | 19 ++++++++++++++++++- kernel/bpf/disasm.c | 16 +++++++++------- 14 files changed, 102 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 897634d0a67c..a951276f0547 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1602,6 +1602,9 @@ exit: rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index dccf98a37283..41c23f474ea6 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -823,6 +823,19 @@ emit_cond_jmp: return ret; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + /* + * Nothing required here. + * + * In case of arm64, we rely on the firmware mitigation of + * Speculative Store Bypass as controlled via the ssbd kernel + * parameter. Whenever the mitigation is enabled, it works + * for all of the kernel code with no need to provide any + * additional instructions. + */ + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 939dd06764bc..3a73e9375712 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1355,6 +1355,9 @@ jeq_common: } break; + case BPF_ST | BPF_NOSPEC: /* speculation barrier */ + break; + case BPF_ST | BPF_B | BPF_MEM: case BPF_ST | BPF_H | BPF_MEM: case BPF_ST | BPF_W | BPF_MEM: diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 34bb1583fc0c..beb12cbc8c29 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -737,6 +737,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index de8595880fee..b87a63dba9c8 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -627,6 +627,12 @@ emit_clear: } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 81de865f4c7c..e6497424cbf6 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -1251,6 +1251,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return -1; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_W: diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 87e3bf5b9086..3af4131c22c7 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -939,6 +939,10 @@ out_be: emit_ld(rd, 0, RV_REG_T1, ctx); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: emit_imm(RV_REG_T1, imm, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 2ae419f5115a..88419263a89a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1153,6 +1153,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, break; } break; + /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; /* * BPF_ST(X) */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 4b8d3c65d266..9a2f20cbd48b 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) return 1; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4b951458c9fc..16d76f814e9b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1219,6 +1219,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; + /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: if (is_ereg(dst_reg)) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3da88ded6ee3..3bfda5f502cb 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1886,6 +1886,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, i++; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..83b896044e79 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -73,6 +73,11 @@ struct ctl_table_header; /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 +/* unused opcode to mark speculation barrier for mitigating + * Speculative Store Bypass + */ +#define BPF_NOSPEC 0xc0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -390,6 +395,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = 0, \ .imm = 0 }) +/* Speculation barrier */ + +#define BPF_ST_NOSPEC() \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_NOSPEC, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9b1577498373..b1a5fc04492b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -32,6 +32,8 @@ #include #include #include + +#include #include /* Registers */ @@ -1377,6 +1379,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, @@ -1621,7 +1624,21 @@ out: COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP - /* STX and ST and LDX*/ + /* ST, STX and LDX*/ + ST_NOSPEC: + /* Speculation barrier for mitigating Speculative Store Bypass. + * In case of arm64, we rely on the firmware mitigation as + * controlled via the ssbd kernel parameter. Whenever the + * mitigation is enabled, it works for all of the kernel code + * with no need to provide any additional instructions here. + * In case of x86, we use 'lfence' insn for mitigation. We + * reuse preexisting logic from Spectre v1 mitigation that + * happens to produce the required code on x86 for v4 as well. + */ +#ifdef CONFIG_X86 + barrier_nospec(); +#endif + CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bbfc6bb79240..ca3cd9aaa6ce 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) == BPF_MEM) { + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { + verbose(cbs->private_data, "(%02x) nospec\n", insn->code); + } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); - return; } - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); -- cgit v1.2.3 From 2039f26f3aca5b0e419b98f65dd36481337b86ee Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Fix leakage due to insufficient speculative store bypass mitigation Spectre v4 gadgets make use of memory disambiguation, which is a set of techniques that execute memory access instructions, that is, loads and stores, out of program order; Intel's optimization manual, section 2.4.4.5: A load instruction micro-op may depend on a preceding store. Many microarchitectures block loads until all preceding store addresses are known. The memory disambiguator predicts which loads will not depend on any previous stores. When the disambiguator predicts that a load does not have such a dependency, the load takes its data from the L1 data cache. Eventually, the prediction is verified. If an actual conflict is detected, the load and all succeeding instructions are re-executed. af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate this attack by sanitizing the memory locations through preemptive "fast" (low latency) stores of zero prior to the actual "slow" (high latency) store of a pointer value such that upon dependency misprediction the CPU then speculatively executes the load of the pointer value and retrieves the zero value instead of the attacker controlled scalar value previously stored at that location, meaning, subsequent access in the speculative domain is then redirected to the "zero page". The sanitized preemptive store of zero prior to the actual "slow" store is done through a simple ST instruction based on r10 (frame pointer) with relative offset to the stack location that the verifier has been tracking on the original used register for STX, which does not have to be r10. Thus, there are no memory dependencies for this store, since it's only using r10 and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency operation. However, a recent attack demonstrated that this mitigation is not sufficient since the preemptive store of zero could also be turned into a "slow" store and is thus bypassed as well: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value 31: (7b) *(u64 *)(r10 -16) = r2 // r9 will remain "fast" register, r10 will become "slow" register below 32: (bf) r9 = r10 // JIT maps BPF reg to x86 reg: // r9 -> r15 (callee saved) // r10 -> rbp // train store forward prediction to break dependency link between both r9 // and r10 by evicting them from the predictor's LRU table. 33: (61) r0 = *(u32 *)(r7 +24576) 34: (63) *(u32 *)(r7 +29696) = r0 35: (61) r0 = *(u32 *)(r7 +24580) 36: (63) *(u32 *)(r7 +29700) = r0 37: (61) r0 = *(u32 *)(r7 +24584) 38: (63) *(u32 *)(r7 +29704) = r0 39: (61) r0 = *(u32 *)(r7 +24588) 40: (63) *(u32 *)(r7 +29708) = r0 [...] 543: (61) r0 = *(u32 *)(r7 +25596) 544: (63) *(u32 *)(r7 +30716) = r0 // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain // in hardware registers. rbp becomes slow due to push/pop latency. below is // disasm of bpf_ringbuf_output() helper for better visual context: // // ffffffff8117ee20: 41 54 push r12 // ffffffff8117ee22: 55 push rbp // ffffffff8117ee23: 53 push rbx // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken // [...] // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea // ffffffff8117eee7: 5b pop rbx // ffffffff8117eee8: 5d pop rbp // ffffffff8117eee9: 4c 89 e0 mov rax,r12 // ffffffff8117eeec: 41 5c pop r12 // ffffffff8117eeee: c3 ret 545: (18) r1 = map[id:4] 547: (bf) r2 = r7 548: (b7) r3 = 0 549: (b7) r4 = 4 550: (85) call bpf_ringbuf_output#194288 // instruction 551 inserted by verifier \ 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here // storing map value pointer r7 at fp-16 | since value of r10 is "slow". 552: (7b) *(u64 *)(r10 -16) = r7 / // following "fast" read to the same memory location, but due to dependency // misprediction it will speculatively execute before insn 551/552 completes. 553: (79) r2 = *(u64 *)(r9 -16) // in speculative domain contains attacker controlled r2. in non-speculative // domain this contains r7, and thus accesses r7 +0 below. 554: (71) r3 = *(u8 *)(r2 +0) // leak r3 As can be seen, the current speculative store bypass mitigation which the verifier inserts at line 551 is insufficient since /both/, the write of the zero sanitation as well as the map value pointer are a high latency instruction due to prior memory access via push/pop of r10 (rbp) in contrast to the low latency read in line 553 as r9 (r15) which stays in hardware registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally, fp-16 can still be r2. Initial thoughts to address this issue was to track spilled pointer loads from stack and enforce their load via LDX through r10 as well so that /both/ the preemptive store of zero /as well as/ the load use the /same/ register such that a dependency is created between the store and load. However, this option is not sufficient either since it can be bypassed as well under speculation. An updated attack with pointer spill/fills now _all_ based on r10 would look as follows: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value [...] // longer store forward prediction training sequence than before. 2062: (61) r0 = *(u32 *)(r7 +25588) 2063: (63) *(u32 *)(r7 +30708) = r0 2064: (61) r0 = *(u32 *)(r7 +25592) 2065: (63) *(u32 *)(r7 +30712) = r0 2066: (61) r0 = *(u32 *)(r7 +25596) 2067: (63) *(u32 *)(r7 +30716) = r0 // store the speculative load address (scalar) this time after the store // forward prediction training. 2068: (7b) *(u64 *)(r10 -16) = r2 // preoccupy the CPU store port by running sequence of dummy stores. 2069: (63) *(u32 *)(r7 +29696) = r0 2070: (63) *(u32 *)(r7 +29700) = r0 2071: (63) *(u32 *)(r7 +29704) = r0 2072: (63) *(u32 *)(r7 +29708) = r0 2073: (63) *(u32 *)(r7 +29712) = r0 2074: (63) *(u32 *)(r7 +29716) = r0 2075: (63) *(u32 *)(r7 +29720) = r0 2076: (63) *(u32 *)(r7 +29724) = r0 2077: (63) *(u32 *)(r7 +29728) = r0 2078: (63) *(u32 *)(r7 +29732) = r0 2079: (63) *(u32 *)(r7 +29736) = r0 2080: (63) *(u32 *)(r7 +29740) = r0 2081: (63) *(u32 *)(r7 +29744) = r0 2082: (63) *(u32 *)(r7 +29748) = r0 2083: (63) *(u32 *)(r7 +29752) = r0 2084: (63) *(u32 *)(r7 +29756) = r0 2085: (63) *(u32 *)(r7 +29760) = r0 2086: (63) *(u32 *)(r7 +29764) = r0 2087: (63) *(u32 *)(r7 +29768) = r0 2088: (63) *(u32 *)(r7 +29772) = r0 2089: (63) *(u32 *)(r7 +29776) = r0 2090: (63) *(u32 *)(r7 +29780) = r0 2091: (63) *(u32 *)(r7 +29784) = r0 2092: (63) *(u32 *)(r7 +29788) = r0 2093: (63) *(u32 *)(r7 +29792) = r0 2094: (63) *(u32 *)(r7 +29796) = r0 2095: (63) *(u32 *)(r7 +29800) = r0 2096: (63) *(u32 *)(r7 +29804) = r0 2097: (63) *(u32 *)(r7 +29808) = r0 2098: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; same as before, also including the // sanitation store with 0 from the current mitigation by the verifier. 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy. // load from stack intended to bypass stores. 2101: (79) r2 = *(u64 *)(r10 -16) 2102: (71) r3 = *(u8 *)(r2 +0) // leak r3 [...] Looking at the CPU microarchitecture, the scheduler might issue loads (such as seen in line 2101) before stores (line 2099,2100) because the load execution units become available while the store execution unit is still busy with the sequence of dummy stores (line 2069-2098). And so the load may use the prior stored scalar from r2 at address r10 -16 for speculation. The updated attack may work less reliable on CPU microarchitectures where loads and stores share execution resources. This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: Prevent memory disambiguation attack") is insufficient. Moreover, the detection of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been written to a given stack slot where a pointer value is now to be stored does not have sufficient coverage as precondition for the mitigation either; for several reasons outlined as follows: 1) Stack content from prior program runs could still be preserved and is therefore not "random", best example is to split a speculative store bypass attack between tail calls, program A would prepare and store the oob address at a given stack slot and then tail call into program B which does the "slow" store of a pointer to the stack with subsequent "fast" read. From program B PoV such stack slot type is STACK_INVALID, and therefore also must be subject to mitigation. 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr) condition, for example, the previous content of that memory location could also be a pointer to map or map value. Without the fix, a speculative store bypass is not mitigated in such precondition and can then lead to a type confusion in the speculative domain leaking kernel memory near these pointer types. While brainstorming on various alternative mitigation possibilities, we also stumbled upon a retrospective from Chrome developers [0]: [...] For variant 4, we implemented a mitigation to zero the unused memory of the heap prior to allocation, which cost about 1% when done concurrently and 4% for scavenging. Variant 4 defeats everything we could think of. We explored more mitigations for variant 4 but the threat proved to be more pervasive and dangerous than we anticipated. For example, stack slots used by the register allocator in the optimizing compiler could be subject to type confusion, leading to pointer crafting. Mitigating type confusion for stack slots alone would have required a complete redesign of the backend of the optimizing compiler, perhaps man years of work, without a guarantee of completeness. [...] From BPF side, the problem space is reduced, however, options are rather limited. One idea that has been explored was to xor-obfuscate pointer spills to the BPF stack: [...] // preoccupy the CPU store port by running sequence of dummy stores. [...] 2106: (63) *(u32 *)(r7 +29796) = r0 2107: (63) *(u32 *)(r7 +29800) = r0 2108: (63) *(u32 *)(r7 +29804) = r0 2109: (63) *(u32 *)(r7 +29808) = r0 2110: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; xored with random 'secret' value // of 943576462 before store ... 2111: (b4) w11 = 943576462 2112: (af) r11 ^= r7 2113: (7b) *(u64 *)(r10 -16) = r11 2114: (79) r11 = *(u64 *)(r10 -16) 2115: (b4) w2 = 943576462 2116: (af) r2 ^= r11 // ... and restored with the same 'secret' value with the help of AX reg. 2117: (71) r3 = *(u8 *)(r2 +0) [...] While the above would not prevent speculation, it would make data leakage infeasible by directing it to random locations. In order to be effective and prevent type confusion under speculation, such random secret would have to be regenerated for each store. The additional complexity involved for a tracking mechanism that prevents jumps such that restoring spilled pointers would not get corrupted is not worth the gain for unprivileged. Hence, the fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC instruction which the x86 JIT translates into a lfence opcode. Inserting the latter in between the store and load instruction is one of the mitigations options [1]. The x86 instruction manual notes: [...] An LFENCE that follows an instruction that stores to memory might complete before the data being stored have become globally visible. [...] The latter meaning that the preceding store instruction finished execution and the store is at minimum guaranteed to be in the CPU's store queue, but it's not guaranteed to be in that CPU's L1 cache at that point (globally visible). The latter would only be guaranteed via sfence. So the load which is guaranteed to execute after the lfence for that local CPU would have to rely on store-to-load forwarding. [2], in section 2.3 on store buffers says: [...] For every store operation that is added to the ROB, an entry is allocated in the store buffer. This entry requires both the virtual and physical address of the target. Only if there is no free entry in the store buffer, the frontend stalls until there is an empty slot available in the store buffer again. Otherwise, the CPU can immediately continue adding subsequent instructions to the ROB and execute them out of order. On Intel CPUs, the store buffer has up to 56 entries. [...] One small upside on the fix is that it lifts constraints from af86ca4e3088 where the sanitize_stack_off relative to r10 must be the same when coming from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX or BPF_ST instruction. This happens either when we store a pointer or data value to the BPF stack for the first time, or upon later pointer spills. The former needs to be enforced since otherwise stale stack data could be leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | BPF_NOSPEC mapping is currently optimized away, but others could emit a speculation barrier as well if necessary. For real-world unprivileged programs e.g. generated by LLVM, pointer spill/fill is only generated upon register pressure and LLVM only tries to do that for pointers which are not used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC sanitation for the STACK_INVALID case when the first write to a stack slot occurs e.g. upon map lookup. In future we might refine ways to mitigate the latter cost. [0] https://arxiv.org/pdf/1902.05178.pdf [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ [2] https://arxiv.org/pdf/1905.05725.pdf Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 87 ++++++++++++++++---------------------------- 2 files changed, 33 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7ba7e800d472..828d08afeee0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -340,8 +340,8 @@ struct bpf_insn_aux_data { }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int sanitize_stack_off; /* stack slot to be cleared */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ + bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 657062cb4d85..f9bda5476ea5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2610,6 +2610,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0) reg = &cur->regs[value_regno]; + if (!env->bypass_spec_v4) { + bool sanitize = reg && is_spillable_regtype(reg->type); + + for (i = 0; i < size; i++) { + if (state->stack[spi].slot_type[i] == STACK_INVALID) { + sanitize = true; + break; + } + } + + if (sanitize) + env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + } if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { @@ -2632,47 +2645,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - - if (!env->bypass_spec_v4) { - bool sanitize = false; - - if (state->stack[spi].slot_type[0] == STACK_SPILL && - register_is_const(&state->stack[spi].spilled_ptr)) - sanitize = true; - for (i = 0; i < BPF_REG_SIZE; i++) - if (state->stack[spi].slot_type[i] == STACK_MISC) { - sanitize = true; - break; - } - if (sanitize) { - int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; - int soff = (-spi - 1) * BPF_REG_SIZE; - - /* detected reuse of integer stack slot with a pointer - * which means either llvm is reusing stack slot or - * an attacker is trying to exploit CVE-2018-3639 - * (speculative store bypass) - * Have to sanitize that slot with preemptive - * store of zero. - */ - if (*poff && *poff != soff) { - /* disallow programs where single insn stores - * into two different stack slots, since verifier - * cannot sanitize them - */ - verbose(env, - "insn %d cannot access two stack slots fp%d and fp%d", - insn_idx, *poff, soff); - return -EINVAL; - } - *poff = soff; - } - } save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -11913,35 +11889,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; + bool ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || - insn->code == (BPF_STX | BPF_MEM | BPF_H) || - insn->code == (BPF_STX | BPF_MEM | BPF_W) || - insn->code == (BPF_STX | BPF_MEM | BPF_DW)) + ctx_access = true; + } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW) || + insn->code == (BPF_ST | BPF_MEM | BPF_B) || + insn->code == (BPF_ST | BPF_MEM | BPF_H) || + insn->code == (BPF_ST | BPF_MEM | BPF_W) || + insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - else + ctx_access = BPF_CLASS(insn->code) == BPF_STX; + } else { continue; + } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_off) { + env->insn_aux_data[i + delta].sanitize_stack_spill) { struct bpf_insn patch[] = { - /* Sanitize suspicious stack slot with zero. - * There are no memory dependencies for this store, - * since it's only using frame pointer and immediate - * constant of zero - */ - BPF_ST_MEM(BPF_DW, BPF_REG_FP, - env->insn_aux_data[i + delta].sanitize_stack_off, - 0), - /* the original STX instruction will immediately - * overwrite the same stack slot with appropriate value - */ *insn, + BPF_ST_NOSPEC(), }; cnt = ARRAY_SIZE(patch); @@ -11955,6 +11929,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } + if (!ctx_access) + continue; + switch (env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) -- cgit v1.2.3