From 09ece3d0f2ee524b8d1d3c775d9eeb50c40558d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 May 2018 23:22:31 +0200 Subject: bpf, arm64: save 4 bytes of unneeded stack space Follow-up to 816d9ef32a8b ("bpf, arm64: remove ld_abs/ld_ind") in that the extra 4 byte JIT scratchpad is not needed anymore since it was in ld_abs/ld_ind as stack buffer for bpf_load_pointer(). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0b40c8fb0706..85113cab2046 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -188,7 +187,7 @@ static int build_prologue(struct jit_ctx *ctx) * | ... | BPF prog stack * | | * +-----+ <= (BPF_FP - prog->aux->stack_depth) - * |RSVD | JIT scratchpad + * |RSVD | padding * current A64_SP => +-----+ <= (BPF_FP - ctx->stack_size) * | | * | ... | Function call stack @@ -220,9 +219,7 @@ static int build_prologue(struct jit_ctx *ctx) return -1; } - /* 4 byte extra for skb_copy_bits buffer */ - ctx->stack_size = prog->aux->stack_depth + 4; - ctx->stack_size = STACK_ALIGN(ctx->stack_size); + ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth); /* Set up function call stack */ emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); -- cgit v1.2.3 From 6d2eea6fb0791bf191043a68937c9aa59c5ea678 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 May 2018 23:22:32 +0200 Subject: bpf, arm64: optimize 32/64 immediate emission Improve the JIT to emit 64 and 32 bit immediates, the current algorithm is not optimal and we often emit more instructions than actually needed. arm64 has movz, movn, movk variants but for the current 64 bit immediates we only use movz with a series of movk when needed. For example loading ffffffffffffabab emits the following 4 instructions in the JIT today: * movz: abab, shift: 0, result: 000000000000abab * movk: ffff, shift: 16, result: 00000000ffffabab * movk: ffff, shift: 32, result: 0000ffffffffabab * movk: ffff, shift: 48, result: ffffffffffffabab Whereas after the patch the same load only needs a single instruction: * movn: 5454, shift: 0, result: ffffffffffffabab Another example where two extra instructions can be saved: * movz: abab, shift: 0, result: 000000000000abab * movk: 1f2f, shift: 16, result: 000000001f2fabab * movk: ffff, shift: 32, result: 0000ffff1f2fabab * movk: ffff, shift: 48, result: ffffffff1f2fabab After the patch: * movn: e0d0, shift: 16, result: ffffffff1f2fffff * movk: abab, shift: 0, result: ffffffff1f2fabab Another example with movz, before: * movz: 0000, shift: 0, result: 0000000000000000 * movk: fea0, shift: 32, result: 0000fea000000000 After: * movz: fea0, shift: 32, result: 0000fea000000000 Moreover, reuse emit_a64_mov_i() for 32 bit immediates that are loaded via emit_a64_mov_i64() which is a similar optimization as done in 6fe8b9c1f41d ("bpf, x64: save several bytes by using mov over movabsq when possible"). On arm64, the latter allows to use a single instruction with movn due to zero extension where otherwise two would be needed. And last but not least add a missing optimization in emit_a64_mov_i() where movn is used but the subsequent movk not needed. With some of the Cilium programs in use, this shrinks the needed instructions by about three percent. Tested on Cavium ThunderX CN8890. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 85 +++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 31 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 85113cab2046..c8a2620312c0 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -79,23 +79,66 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx) ctx->idx++; } +static inline void emit_a64_mov_i(const int is64, const int reg, + const s32 val, struct jit_ctx *ctx) +{ + u16 hi = val >> 16; + u16 lo = val & 0xffff; + + if (hi & 0x8000) { + if (hi == 0xffff) { + emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx); + } else { + emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx); + if (lo != 0xffff) + emit(A64_MOVK(is64, reg, lo, 0), ctx); + } + } else { + emit(A64_MOVZ(is64, reg, lo, 0), ctx); + if (hi) + emit(A64_MOVK(is64, reg, hi, 16), ctx); + } +} + +static int i64_i16_blocks(const u64 val, bool inverse) +{ + return (((val >> 0) & 0xffff) != (inverse ? 0xffff : 0x0000)) + + (((val >> 16) & 0xffff) != (inverse ? 0xffff : 0x0000)) + + (((val >> 32) & 0xffff) != (inverse ? 0xffff : 0x0000)) + + (((val >> 48) & 0xffff) != (inverse ? 0xffff : 0x0000)); +} + static inline void emit_a64_mov_i64(const int reg, const u64 val, struct jit_ctx *ctx) { - u64 tmp = val; - int shift = 0; - - emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx); - tmp >>= 16; - shift += 16; - while (tmp) { - if (tmp & 0xffff) - emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx); - tmp >>= 16; - shift += 16; + u64 nrm_tmp = val, rev_tmp = ~val; + bool inverse; + int shift; + + if (!(nrm_tmp >> 32)) + return emit_a64_mov_i(0, reg, (u32)val, ctx); + + inverse = i64_i16_blocks(nrm_tmp, true) < i64_i16_blocks(nrm_tmp, false); + shift = max(round_down((inverse ? (fls64(rev_tmp) - 1) : + (fls64(nrm_tmp) - 1)), 16), 0); + if (inverse) + emit(A64_MOVN(1, reg, (rev_tmp >> shift) & 0xffff, shift), ctx); + else + emit(A64_MOVZ(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx); + shift -= 16; + while (shift >= 0) { + if (((nrm_tmp >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000)) + emit(A64_MOVK(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx); + shift -= 16; } } +/* + * This is an unoptimized 64 immediate emission used for BPF to BPF call + * addresses. It will always do a full 64 bit decomposition as otherwise + * more complexity in the last extra pass is required since we previously + * reserved 4 instructions for the address. + */ static inline void emit_addr_mov_i64(const int reg, const u64 val, struct jit_ctx *ctx) { @@ -110,26 +153,6 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val, } } -static inline void emit_a64_mov_i(const int is64, const int reg, - const s32 val, struct jit_ctx *ctx) -{ - u16 hi = val >> 16; - u16 lo = val & 0xffff; - - if (hi & 0x8000) { - if (hi == 0xffff) { - emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx); - } else { - emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx); - emit(A64_MOVK(is64, reg, lo, 0), ctx); - } - } else { - emit(A64_MOVZ(is64, reg, lo, 0), ctx); - if (hi) - emit(A64_MOVK(is64, reg, hi, 16), ctx); - } -} - static inline int bpf2a64_offset(int bpf_to, int bpf_from, const struct jit_ctx *ctx) { -- cgit v1.2.3 From 56ea6a8b4949c66d550d05e92da8ef4351b90027 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 May 2018 23:22:33 +0200 Subject: bpf, arm64: save 4 bytes in prologue when ebpf insns came from cbpf We can trivially save 4 bytes in prologue for cBPF since tail calls can never be used from there. The register push/pop is pairwise, here, x25 (fp) and x26 (tcc), so no point in changing that, only reset to zero is not needed. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c8a2620312c0..a6fdaea07c63 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -185,7 +185,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) /* Tail call offset to jump into */ #define PROLOGUE_OFFSET 7 -static int build_prologue(struct jit_ctx *ctx) +static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) { const struct bpf_prog *prog = ctx->prog; const u8 r6 = bpf2a64[BPF_REG_6]; @@ -232,14 +232,16 @@ static int build_prologue(struct jit_ctx *ctx) /* Set up BPF prog stack base register */ emit(A64_MOV(1, fp, A64_SP), ctx); - /* Initialize tail_call_cnt */ - emit(A64_MOVZ(1, tcc, 0, 0), ctx); + if (!ebpf_from_cbpf) { + /* Initialize tail_call_cnt */ + emit(A64_MOVZ(1, tcc, 0, 0), ctx); - cur_offset = ctx->idx - idx0; - if (cur_offset != PROLOGUE_OFFSET) { - pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n", - cur_offset, PROLOGUE_OFFSET); - return -1; + cur_offset = ctx->idx - idx0; + if (cur_offset != PROLOGUE_OFFSET) { + pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n", + cur_offset, PROLOGUE_OFFSET); + return -1; + } } ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth); @@ -806,6 +808,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; struct arm64_jit_data *jit_data; + bool was_classic = bpf_prog_was_classic(prog); bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; @@ -860,7 +863,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_off; } - if (build_prologue(&ctx)) { + if (build_prologue(&ctx, was_classic)) { prog = orig_prog; goto out_off; } @@ -883,7 +886,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) skip_init_ctx: ctx.idx = 0; - build_prologue(&ctx); + build_prologue(&ctx, was_classic); if (build_body(&ctx)) { bpf_jit_binary_free(header); -- cgit v1.2.3