From b6ec79518ef0c84d1ed0f76b8af9592a75eb29b6 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 27 Jan 2022 16:32:40 +0800 Subject: bpf, x86: Remove unnecessary handling of BPF_SUB atomic op According to the LLVM commit (https://reviews.llvm.org/D72184), sync_fetch_and_sub() is implemented as a negation followed by sync_fetch_and_add(), so there will be no BPF_SUB op, thus just remove it. BPF_SUB is also rejected by the verifier anyway. Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Brendan Jackman Link: https://lore.kernel.org/bpf/20220127083240.1425481-1-houtao1@huawei.com --- arch/x86/net/bpf_jit_comp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2b1e266ff95c..36f6fc3e6e69 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -787,7 +787,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* emit opcode */ switch (atomic_op) { case BPF_ADD: - case BPF_SUB: case BPF_AND: case BPF_OR: case BPF_XOR: -- cgit v1.2.3 From b5e975d256dbfebd62413eb04fbff6803f02a43c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sun, 30 Jan 2022 17:29:15 +0800 Subject: bpf, arm64: Enable kfunc call Since commit b2eed9b58811 ("arm64/kernel: kaslr: reduce module randomization range to 2 GB"), for arm64 whether KASLR is enabled or not, the module is placed within 2GB of the kernel region, so s32 in bpf_kfunc_desc is sufficient to represente the offset of module function relative to __bpf_call_base. The only thing needed is to override bpf_jit_supports_kfunc_call(). Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220130092917.14544-2-hotforest@gmail.com --- arch/arm64/net/bpf_jit_comp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e96d4d87291f..74f9a9b6a053 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1143,6 +1143,11 @@ out: return prog; } +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + u64 bpf_jit_alloc_exec_limit(void) { return VMALLOC_END - VMALLOC_START; -- cgit v1.2.3 From fac54e2bfb5be2b0bbf115fe80d45f59fd773048 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:34 -0800 Subject: x86/Kconfig: Select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP This enables module_alloc() to allocate huge page for 2MB+ requests. To check the difference of this change, we need enable config CONFIG_PTDUMP_DEBUGFS, and call module_alloc(2MB). Before the change, /sys/kernel/debug/page_tables/kernel shows pte for this map. With the change, /sys/kernel/debug/page_tables/ show pmd for thie map. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-2-song@kernel.org --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6fddb63271d9..e0e0d00cf103 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -159,6 +159,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 -- cgit v1.2.3 From 0e06b40371682b6b70ed0302a7baec0698ada95c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:38 -0800 Subject: x86/alternative: Introduce text_poke_copy This will be used by BPF jit compiler to dump JITed binary to a RX huge page, and thus allow multiple BPF programs sharing the a huge (2MB) page. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20220204185742.271030-6-song@kernel.org --- arch/x86/include/asm/text-patching.h | 1 + arch/x86/kernel/alternative.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index b7421780e4e9..4cc18ba1b75e 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); +extern void *text_poke_copy(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5007c3ffe96f..018b61febf0e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1102,6 +1102,40 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) return __text_poke(addr, opcode, len); } +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke((void *)ptr, opcode + patched, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + static void do_sync_core(void *info) { sync_core(); -- cgit v1.2.3 From ebc1415d9b4f043cef5a1fb002ec316e32167e7a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:39 -0800 Subject: bpf: Introduce bpf_arch_text_copy This will be used to copy JITed text to RO protected module memory. On x86, bpf_arch_text_copy is implemented with text_poke_copy. bpf_arch_text_copy returns pointer to dst on success, and ERR_PTR(errno) on errors. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-7-song@kernel.org --- arch/x86/net/bpf_jit_comp.c | 7 +++++++ include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 5 +++++ 3 files changed, 14 insertions(+) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 36f6fc3e6e69..c13d148f7396 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2412,3 +2412,10 @@ bool bpf_jit_supports_kfunc_call(void) { return true; } + +void *bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + if (text_poke_copy(dst, src, len) == NULL) + return ERR_PTR(-EINVAL); + return dst; +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 366f88afd56b..ea0d7fd4a410 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2362,6 +2362,8 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void *bpf_arch_text_copy(void *dst, void *src, size_t len); + struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e3fe53df0a71..a5ec480f9862 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2440,6 +2440,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return -ENOTSUPP; } +void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + return ERR_PTR(-ENOTSUPP); +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -- cgit v1.2.3 From 1022a5498f6f745c3b5fd3f050a5e11e7ca354f0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:42 -0800 Subject: bpf, x86_64: Use bpf_jit_binary_pack_alloc Use bpf_jit_binary_pack_alloc in x86_64 jit. The jit engine first writes the program to the rw buffer. When the jit is done, the program is copied to the final location with bpf_jit_binary_pack_finalize. Note that we need to do bpf_tail_call_direct_fixup after finalize. Therefore, the text_live = false logic in __bpf_arch_text_poke is no longer needed. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-10-song@kernel.org --- arch/x86/net/bpf_jit_comp.c | 58 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index c13d148f7396..643f38b91e30 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -330,8 +330,7 @@ static int emit_jump(u8 **pprog, void *func, void *ip) } static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr, - const bool text_live) + void *old_addr, void *new_addr) { const u8 *nop_insn = x86_nops[5]; u8 old_insn[X86_PATCH_SIZE]; @@ -365,10 +364,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, goto out; ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { - if (text_live) - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); - else - memcpy(ip, new_insn, X86_PATCH_SIZE); + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); ret = 0; } out: @@ -384,7 +380,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, /* BPF poking in modules is not supported */ return -EINVAL; - return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); + return __bpf_arch_text_poke(ip, t, old_addr, new_addr); } #define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) @@ -558,24 +554,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) mutex_lock(&array->aux->poke_mutex); target = array->ptrs[poke->tail_call.key]; if (target) { - /* Plain memcpy is used when image is not live yet - * and still not locked as read-only. Once poke - * location is active (poke->tailcall_target_stable), - * any parallel bpf_arch_text_poke() might occur - * still on the read-write image until we finally - * locked it as read-only. Both modifications on - * the given image are under text_mutex to avoid - * interference. - */ ret = __bpf_arch_text_poke(poke->tailcall_target, BPF_MOD_JUMP, NULL, (u8 *)target->bpf_func + - poke->adj_off, false); + poke->adj_off); BUG_ON(ret < 0); ret = __bpf_arch_text_poke(poke->tailcall_bypass, BPF_MOD_JUMP, (u8 *)poke->tailcall_target + - X86_PATCH_SIZE, NULL, false); + X86_PATCH_SIZE, NULL); BUG_ON(ret < 0); } WRITE_ONCE(poke->tailcall_target_stable, true); @@ -866,7 +853,7 @@ static void emit_nops(u8 **pprog, int len) #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; @@ -893,8 +880,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, push_callee_regs(&prog, callee_regs_used); ilen = prog - temp; - if (image) - memcpy(image + proglen, temp, ilen); + if (rw_image) + memcpy(rw_image + proglen, temp, ilen); proglen += ilen; addrs[0] = proglen; prog = temp; @@ -1323,6 +1310,9 @@ st: if (is_imm8(insn->off)) pr_err("extable->insn doesn't fit into 32-bit\n"); return -EFAULT; } + /* switch ex to rw buffer for writes */ + ex = (void *)rw_image + ((void *)ex - (void *)image); + ex->insn = delta; ex->data = EX_TYPE_BPF; @@ -1705,7 +1695,7 @@ emit_jmp: pr_err("bpf_jit: fatal error\n"); return -EFAULT; } - memcpy(image + proglen, temp, ilen); + memcpy(rw_image + proglen, temp, ilen); } proglen += ilen; addrs[i] = proglen; @@ -2246,6 +2236,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) } struct x64_jit_data { + struct bpf_binary_header *rw_header; struct bpf_binary_header *header; int *addrs; u8 *image; @@ -2258,6 +2249,7 @@ struct x64_jit_data { struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { + struct bpf_binary_header *rw_header = NULL; struct bpf_binary_header *header = NULL; struct bpf_prog *tmp, *orig_prog = prog; struct x64_jit_data *jit_data; @@ -2266,6 +2258,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bool tmp_blinded = false; bool extra_pass = false; bool padding = false; + u8 *rw_image = NULL; u8 *image = NULL; int *addrs; int pass; @@ -2301,6 +2294,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) oldproglen = jit_data->proglen; image = jit_data->image; header = jit_data->header; + rw_header = jit_data->rw_header; + rw_image = (void *)rw_header + ((void *)image - (void *)header); extra_pass = true; padding = true; goto skip_init_addrs; @@ -2331,12 +2326,12 @@ skip_init_addrs: for (pass = 0; pass < MAX_PASSES || image; pass++) { if (!padding && pass >= PADDING_PASSES) padding = true; - proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding); + proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; if (header) - bpf_jit_binary_free(header); + bpf_jit_binary_pack_free(header, rw_header); prog = orig_prog; goto out_addrs; } @@ -2360,8 +2355,9 @@ out_image: sizeof(struct exception_table_entry); /* allocate module memory for x86 insns and extable */ - header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size, - &image, align, jit_fill_hole); + header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size, + &image, align, &rw_header, &rw_image, + jit_fill_hole); if (!header) { prog = orig_prog; goto out_addrs; @@ -2377,14 +2373,22 @@ out_image: if (image) { if (!prog->is_func || extra_pass) { + /* + * bpf_jit_binary_pack_finalize fails in two scenarios: + * 1) header is not pointing to proper module memory; + * 2) the arch doesn't support bpf_arch_text_copy(). + * + * Both cases are serious bugs that we should not continue. + */ + BUG_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header)); bpf_tail_call_direct_fixup(prog); - bpf_jit_binary_lock_ro(header); } else { jit_data->addrs = addrs; jit_data->ctx = ctx; jit_data->proglen = proglen; jit_data->image = image; jit_data->header = header; + jit_data->rw_header = rw_header; } prog->bpf_func = (void *)image; prog->jited = 1; -- cgit v1.2.3 From f95f768f0af4cec806ce86cd67934a10617d96d0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 7 Feb 2022 22:25:33 -0800 Subject: bpf, x86_64: Fail gracefully on bpf_jit_binary_pack_finalize failures Instead of BUG_ON(), fail gracefully and return orig_prog. Fixes: 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220208062533.3802081-1-song@kernel.org --- arch/x86/net/bpf_jit_comp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 643f38b91e30..c7db0fe4de2f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2378,9 +2378,13 @@ out_image: * 1) header is not pointing to proper module memory; * 2) the arch doesn't support bpf_arch_text_copy(). * - * Both cases are serious bugs that we should not continue. + * Both cases are serious bugs and justify WARN_ON. */ - BUG_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header)); + if (WARN_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header))) { + prog = orig_prog; + goto out_addrs; + } + bpf_tail_call_direct_fixup(prog); } else { jit_data->addrs = addrs; -- cgit v1.2.3 From 0f350231b5ac8867d552122b9ec88e8d7da00e8c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Feb 2022 14:05:08 -0800 Subject: bpf: Fix leftover header->pages in sparc and powerpc code. Replace header->pages * PAGE_SIZE with new header->size. Fixes: ed2d9e1a26cc ("bpf: Use size instead of pages in bpf_binary_header") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220208220509.4180389-2-song@kernel.org --- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index d6ffdd0f2309..9003c313475d 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -247,7 +247,7 @@ skip_codegen_passes: fp->jited = 1; fp->jited_len = proglen + FUNCTION_DESCR_SIZE; - bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); + bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + bpf_hdr->size); if (!fp->is_func || extra_pass) { bpf_jit_binary_lock_ro(bpf_hdr); bpf_prog_fill_jited_linfo(fp, addrs); diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index b1e38784eb23..fa0759bfe498 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1599,7 +1599,7 @@ skip_init_ctx: if (bpf_jit_enable > 1) bpf_jit_dump(prog->len, image_size, pass, ctx.image); - bpf_flush_icache(header, (u8 *)header + (header->pages * PAGE_SIZE)); + bpf_flush_icache(header, (u8 *)header + header->size); if (!prog->is_func || extra_pass) { bpf_jit_binary_lock_ro(header); -- cgit v1.2.3