summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/net/bpf_jit_comp.c219
-rw-r--r--include/linux/bpf-cgroup.h2
-rw-r--r--include/linux/filter.h2
-rw-r--r--kernel/bpf/verifier.c4
-rw-r--r--samples/bpf/Makefile4
-rw-r--r--samples/bpf/cpustat_kern.c281
-rw-r--r--samples/bpf/cpustat_user.c219
-rw-r--r--samples/bpf/xdp_redirect_user.c7
-rw-r--r--samples/sockmap/Makefile2
-rw-r--r--samples/sockmap/sockmap_user.c1
-rw-r--r--tools/testing/selftests/bpf/Makefile4
-rw-r--r--tools/testing/selftests/bpf/test_tcpbpf_user.c6
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c128
13 files changed, 779 insertions, 100 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 45e4eb5bcbb2..cbf94d44f3d5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -61,7 +61,12 @@ static bool is_imm8(int value)
static bool is_simm32(s64 value)
{
- return value == (s64) (s32) value;
+ return value == (s64)(s32)value;
+}
+
+static bool is_uimm32(u64 value)
+{
+ return value == (u64)(u32)value;
}
/* mov dst, src */
@@ -212,7 +217,7 @@ struct jit_context {
/* emit x64 prologue code for BPF program and check it's size.
* bpf_tail_call helper will skip it while jumping into another program
*/
-static void emit_prologue(u8 **pprog, u32 stack_depth)
+static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
{
u8 *prog = *pprog;
int cnt = 0;
@@ -247,18 +252,21 @@ static void emit_prologue(u8 **pprog, u32 stack_depth)
/* mov qword ptr [rbp+24],r15 */
EMIT4(0x4C, 0x89, 0x7D, 24);
- /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls
- * we need to reset the counter to 0. It's done in two instructions,
- * resetting rax register to 0 (xor on eax gets 0 extended), and
- * moving it to the counter location.
- */
+ if (!ebpf_from_cbpf) {
+ /* Clear the tail call counter (tail_call_cnt): for eBPF tail
+ * calls we need to reset the counter to 0. It's done in two
+ * instructions, resetting rax register to 0, and moving it
+ * to the counter location.
+ */
+
+ /* xor eax, eax */
+ EMIT2(0x31, 0xc0);
+ /* mov qword ptr [rbp+32], rax */
+ EMIT4(0x48, 0x89, 0x45, 32);
- /* xor eax, eax */
- EMIT2(0x31, 0xc0);
- /* mov qword ptr [rbp+32], rax */
- EMIT4(0x48, 0x89, 0x45, 32);
+ BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+ }
- BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
*pprog = prog;
}
@@ -356,6 +364,86 @@ static void emit_load_skb_data_hlen(u8 **pprog)
*pprog = prog;
}
+static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
+ u32 dst_reg, const u32 imm32)
+{
+ u8 *prog = *pprog;
+ u8 b1, b2, b3;
+ int cnt = 0;
+
+ /* optimization: if imm32 is positive, use 'mov %eax, imm32'
+ * (which zero-extends imm32) to save 2 bytes.
+ */
+ if (sign_propagate && (s32)imm32 < 0) {
+ /* 'mov %rax, imm32' sign extends imm32 */
+ b1 = add_1mod(0x48, dst_reg);
+ b2 = 0xC7;
+ b3 = 0xC0;
+ EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
+ goto done;
+ }
+
+ /* optimization: if imm32 is zero, use 'xor %eax, %eax'
+ * to save 3 bytes.
+ */
+ if (imm32 == 0) {
+ if (is_ereg(dst_reg))
+ EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
+ goto done;
+ }
+
+ /* mov %eax, imm32 */
+ if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+ EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
+done:
+ *pprog = prog;
+}
+
+static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
+ const u32 imm32_hi, const u32 imm32_lo)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
+ /* For emitting plain u32, where sign bit must not be
+ * propagated LLVM tends to load imm64 over mov32
+ * directly, so save couple of bytes by just doing
+ * 'mov %eax, imm32' instead.
+ */
+ emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
+ } else {
+ /* movabsq %rax, imm64 */
+ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
+ EMIT(imm32_lo, 4);
+ EMIT(imm32_hi, 4);
+ }
+
+ *pprog = prog;
+}
+
+static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is64) {
+ /* mov dst, src */
+ EMIT_mov(dst_reg, src_reg);
+ } else {
+ /* mov32 dst, src */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
+ }
+
+ *pprog = prog;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
@@ -369,7 +457,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int proglen = 0;
u8 *prog = temp;
- emit_prologue(&prog, bpf_prog->aux->stack_depth);
+ emit_prologue(&prog, bpf_prog->aux->stack_depth,
+ bpf_prog_was_classic(bpf_prog));
if (seen_ld_abs)
emit_load_skb_data_hlen(&prog);
@@ -378,7 +467,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
const s32 imm32 = insn->imm;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
- u8 b1 = 0, b2 = 0, b3 = 0;
+ u8 b2 = 0, b3 = 0;
s64 jmp_offset;
u8 jmp_cond;
bool reload_skb_data;
@@ -414,16 +503,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
break;
- /* mov dst, src */
case BPF_ALU64 | BPF_MOV | BPF_X:
- EMIT_mov(dst_reg, src_reg);
- break;
-
- /* mov32 dst, src */
case BPF_ALU | BPF_MOV | BPF_X:
- if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
- EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
+ emit_mov_reg(&prog,
+ BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, src_reg);
break;
/* neg dst */
@@ -486,58 +570,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
break;
case BPF_ALU64 | BPF_MOV | BPF_K:
- /* optimization: if imm32 is positive,
- * use 'mov eax, imm32' (which zero-extends imm32)
- * to save 2 bytes
- */
- if (imm32 < 0) {
- /* 'mov rax, imm32' sign extends imm32 */
- b1 = add_1mod(0x48, dst_reg);
- b2 = 0xC7;
- b3 = 0xC0;
- EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
- break;
- }
-
case BPF_ALU | BPF_MOV | BPF_K:
- /* optimization: if imm32 is zero, use 'xor <dst>,<dst>'
- * to save 3 bytes.
- */
- if (imm32 == 0) {
- if (is_ereg(dst_reg))
- EMIT1(add_2mod(0x40, dst_reg, dst_reg));
- b2 = 0x31; /* xor */
- b3 = 0xC0;
- EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
- break;
- }
-
- /* mov %eax, imm32 */
- if (is_ereg(dst_reg))
- EMIT1(add_1mod(0x40, dst_reg));
- EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
+ emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, imm32);
break;
case BPF_LD | BPF_IMM | BPF_DW:
- /* optimization: if imm64 is zero, use 'xor <dst>,<dst>'
- * to save 7 bytes.
- */
- if (insn[0].imm == 0 && insn[1].imm == 0) {
- b1 = add_2mod(0x48, dst_reg, dst_reg);
- b2 = 0x31; /* xor */
- b3 = 0xC0;
- EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg));
-
- insn++;
- i++;
- break;
- }
-
- /* movabsq %rax, imm64 */
- EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
- EMIT(insn[0].imm, 4);
- EMIT(insn[1].imm, 4);
-
+ emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm);
insn++;
i++;
break;
@@ -594,36 +633,38 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU | BPF_MUL | BPF_X:
case BPF_ALU64 | BPF_MUL | BPF_K:
case BPF_ALU64 | BPF_MUL | BPF_X:
- EMIT1(0x50); /* push rax */
- EMIT1(0x52); /* push rdx */
+ {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+
+ if (dst_reg != BPF_REG_0)
+ EMIT1(0x50); /* push rax */
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x52); /* push rdx */
/* mov r11, dst_reg */
EMIT_mov(AUX_REG, dst_reg);
if (BPF_SRC(insn->code) == BPF_X)
- /* mov rax, src_reg */
- EMIT_mov(BPF_REG_0, src_reg);
+ emit_mov_reg(&prog, is64, BPF_REG_0, src_reg);
else
- /* mov rax, imm32 */
- EMIT3_off32(0x48, 0xC7, 0xC0, imm32);
+ emit_mov_imm32(&prog, is64, BPF_REG_0, imm32);
- if (BPF_CLASS(insn->code) == BPF_ALU64)
+ if (is64)
EMIT1(add_1mod(0x48, AUX_REG));
else if (is_ereg(AUX_REG))
EMIT1(add_1mod(0x40, AUX_REG));
/* mul(q) r11 */
EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
- /* mov r11, rax */
- EMIT_mov(AUX_REG, BPF_REG_0);
-
- EMIT1(0x5A); /* pop rdx */
- EMIT1(0x58); /* pop rax */
-
- /* mov dst_reg, r11 */
- EMIT_mov(dst_reg, AUX_REG);
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x5A); /* pop rdx */
+ if (dst_reg != BPF_REG_0) {
+ /* mov dst_reg, rax */
+ EMIT_mov(dst_reg, BPF_REG_0);
+ EMIT1(0x58); /* pop rax */
+ }
break;
-
+ }
/* shifts */
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU | BPF_RSH | BPF_K:
@@ -641,7 +682,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_RSH: b3 = 0xE8; break;
case BPF_ARSH: b3 = 0xF8; break;
}
- EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
+
+ if (imm32 == 1)
+ EMIT2(0xD1, add_1reg(b3, dst_reg));
+ else
+ EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
break;
case BPF_ALU | BPF_LSH | BPF_X:
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a7f16e0f8d68..8a4566691c8f 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -96,7 +96,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
({ \
int __ret = 0; \
- if (cgroup_bpf_enabled && sk) { \
+ if (cgroup_bpf_enabled) { \
__ret = __cgroup_bpf_run_filter_sk(sk, \
BPF_CGROUP_INET_SOCK_CREATE); \
} \
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 276932d75975..fdb691b520c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -20,7 +20,6 @@
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
-#include <net/xdp.h>
#include <net/sch_generic.h>
#include <uapi/linux/filter.h>
@@ -30,6 +29,7 @@ struct sk_buff;
struct sock;
struct seccomp_data;
struct bpf_prog_aux;
+struct xdp_rxq_info;
/* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5fb69a85d967..3c74b163eaeb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -508,10 +508,6 @@ err:
static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-#define CALLEE_SAVED_REGS 5
-static const int callee_saved[CALLEE_SAVED_REGS] = {
- BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
-};
static void __mark_reg_not_init(struct bpf_reg_state *reg);
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ec3fc8d88e87..2c2a587e0942 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu
hostprogs-y += xdp_monitor
hostprogs-y += xdp_rxq_info
hostprogs-y += syscall_tp
+hostprogs-y += cpustat
# Libbpf dependencies
LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -93,6 +94,7 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
+cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -144,6 +146,7 @@ always += xdp_monitor_kern.o
always += xdp_rxq_info_kern.o
always += xdp2skb_meta_kern.o
always += syscall_tp_kern.o
+always += cpustat_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -188,6 +191,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf
HOSTLOADLIBES_xdp_monitor += -lelf
HOSTLOADLIBES_xdp_rxq_info += -lelf
HOSTLOADLIBES_syscall_tp += -lelf
+HOSTLOADLIBES_cpustat += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
new file mode 100644
index 000000000000..68c84da065b1
--- /dev/null
+++ b/samples/bpf/cpustat_kern.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/*
+ * The CPU number, cstate number and pstate number are based
+ * on 96boards Hikey with octa CA53 CPUs.
+ *
+ * Every CPU have three idle states for cstate:
+ * WFI, CPU_OFF, CLUSTER_OFF
+ *
+ * Every CPU have 5 operating points:
+ * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
+ *
+ * This code is based on these assumption and other platforms
+ * need to adjust these definitions.
+ */
+#define MAX_CPU 8
+#define MAX_PSTATE_ENTRIES 5
+#define MAX_CSTATE_ENTRIES 3
+
+static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
+
+/*
+ * my_map structure is used to record cstate and pstate index and
+ * timestamp (Idx, Ts), when new event incoming we need to update
+ * combination for new state index and timestamp (Idx`, Ts`).
+ *
+ * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
+ * interval for the previous state: Duration(Idx) = Ts` - Ts.
+ *
+ * Every CPU has one below array for recording state index and
+ * timestamp, and record for cstate and pstate saperately:
+ *
+ * +--------------------------+
+ * | cstate timestamp |
+ * +--------------------------+
+ * | cstate index |
+ * +--------------------------+
+ * | pstate timestamp |
+ * +--------------------------+
+ * | pstate index |
+ * +--------------------------+
+ */
+#define MAP_OFF_CSTATE_TIME 0
+#define MAP_OFF_CSTATE_IDX 1
+#define MAP_OFF_PSTATE_TIME 2
+#define MAP_OFF_PSTATE_IDX 3
+#define MAP_OFF_NUM 4
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_CPU * MAP_OFF_NUM,
+};
+
+/* cstate_duration records duration time for every idle state per CPU */
+struct bpf_map_def SEC("maps") cstate_duration = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES,
+};
+
+/* pstate_duration records duration time for every operating point per CPU */
+struct bpf_map_def SEC("maps") pstate_duration = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES,
+};
+
+/*
+ * The trace events for cpu_idle and cpu_frequency are taken from:
+ * /sys/kernel/debug/tracing/events/power/cpu_idle/format
+ * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
+ *
+ * These two events have same format, so define one common structure.
+ */
+struct cpu_args {
+ u64 pad;
+ u32 state;
+ u32 cpu_id;
+};
+
+/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
+static u32 find_cpu_pstate_idx(u32 frequency)
+{
+ u32 i;
+
+ for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
+ if (frequency == cpu_opps[i])
+ return i;
+ }
+
+ return i;
+}
+
+SEC("tracepoint/power/cpu_idle")
+int bpf_prog1(struct cpu_args *ctx)
+{
+ u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
+ u32 key, cpu, pstate_idx;
+ u64 *val;
+
+ if (ctx->cpu_id > MAX_CPU)
+ return 0;
+
+ cpu = ctx->cpu_id;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
+ cts = bpf_map_lookup_elem(&my_map, &key);
+ if (!cts)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+ cstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!cstate)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+ pts = bpf_map_lookup_elem(&my_map, &key);
+ if (!pts)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+ pstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!pstate)
+ return 0;
+
+ prev_state = *cstate;
+ *cstate = ctx->state;
+
+ if (!*cts) {
+ *cts = bpf_ktime_get_ns();
+ return 0;
+ }
+
+ cur_ts = bpf_ktime_get_ns();
+ delta = cur_ts - *cts;
+ *cts = cur_ts;
+
+ /*
+ * When state doesn't equal to (u32)-1, the cpu will enter
+ * one idle state; for this case we need to record interval
+ * for the pstate.
+ *
+ * OPP2
+ * +---------------------+
+ * OPP1 | |
+ * ---------+ |
+ * | Idle state
+ * +---------------
+ *
+ * |<- pstate duration ->|
+ * ^ ^
+ * pts cur_ts
+ */
+ if (ctx->state != (u32)-1) {
+
+ /* record pstate after have first cpu_frequency event */
+ if (!*pts)
+ return 0;
+
+ delta = cur_ts - *pts;
+
+ pstate_idx = find_cpu_pstate_idx(*pstate);
+ if (pstate_idx >= MAX_PSTATE_ENTRIES)
+ return 0;
+
+ key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+ val = bpf_map_lookup_elem(&pstate_duration, &key);
+ if (val)
+ __sync_fetch_and_add((long *)val, delta);
+
+ /*
+ * When state equal to (u32)-1, the cpu just exits from one
+ * specific idle state; for this case we need to record
+ * interval for the pstate.
+ *
+ * OPP2
+ * -----------+
+ * | OPP1
+ * | +-----------
+ * | Idle state |
+ * +---------------------+
+ *
+ * |<- cstate duration ->|
+ * ^ ^
+ * cts cur_ts
+ */
+ } else {
+
+ key = cpu * MAX_CSTATE_ENTRIES + prev_state;
+ val = bpf_map_lookup_elem(&cstate_duration, &key);
+ if (val)
+ __sync_fetch_and_add((long *)val, delta);
+ }
+
+ /* Update timestamp for pstate as new start time */
+ if (*pts)
+ *pts = cur_ts;
+
+ return 0;
+}
+
+SEC("tracepoint/power/cpu_frequency")
+int bpf_prog2(struct cpu_args *ctx)
+{
+ u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
+ u32 key, cpu, pstate_idx;
+ u64 *val;
+
+ cpu = ctx->cpu_id;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+ pts = bpf_map_lookup_elem(&my_map, &key);
+ if (!pts)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+ pstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!pstate)
+ return 0;
+
+ key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+ cstate = bpf_map_lookup_elem(&my_map, &key);
+ if (!cstate)
+ return 0;
+
+ prev_state = *pstate;
+ *pstate = ctx->state;
+
+ if (!*pts) {
+ *pts = bpf_ktime_get_ns();
+ return 0;
+ }
+
+ cur_ts = bpf_ktime_get_ns();
+ delta = cur_ts - *pts;
+ *pts = cur_ts;
+
+ /* When CPU is in idle, bail out to skip pstate statistics */
+ if (*cstate != (u32)(-1))
+ return 0;
+
+ /*
+ * The cpu changes to another different OPP (in below diagram
+ * change frequency from OPP3 to OPP1), need recording interval
+ * for previous frequency OPP3 and update timestamp as start
+ * time for new frequency OPP1.
+ *
+ * OPP3
+ * +---------------------+
+ * OPP2 | |
+ * ---------+ |
+ * | OPP1
+ * +---------------
+ *
+ * |<- pstate duration ->|
+ * ^ ^
+ * pts cur_ts
+ */
+ pstate_idx = find_cpu_pstate_idx(*pstate);
+ if (pstate_idx >= MAX_PSTATE_ENTRIES)
+ return 0;
+
+ key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+ val = bpf_map_lookup_elem(&pstate_duration, &key);
+ if (val)
+ __sync_fetch_and_add((long *)val, delta);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
new file mode 100644
index 000000000000..2b4cd1ae57c5
--- /dev/null
+++ b/samples/bpf/cpustat_user.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sched.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_CPU 8
+#define MAX_PSTATE_ENTRIES 5
+#define MAX_CSTATE_ENTRIES 3
+#define MAX_STARS 40
+
+#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq"
+#define CPUFREQ_LOWEST_FREQ "208000"
+#define CPUFREQ_HIGHEST_FREQ "12000000"
+
+struct cpu_stat_data {
+ unsigned long cstate[MAX_CSTATE_ENTRIES];
+ unsigned long pstate[MAX_PSTATE_ENTRIES];
+};
+
+static struct cpu_stat_data stat_data[MAX_CPU];
+
+static void cpu_stat_print(void)
+{
+ int i, j;
+ char state_str[sizeof("cstate-9")];
+ struct cpu_stat_data *data;
+
+ /* Clear screen */
+ printf("\033[2J");
+
+ /* Header */
+ printf("\nCPU states statistics:\n");
+ printf("%-10s ", "state(ms)");
+
+ for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
+ sprintf(state_str, "cstate-%d", i);
+ printf("%-11s ", state_str);
+ }
+
+ for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
+ sprintf(state_str, "pstate-%d", i);
+ printf("%-11s ", state_str);
+ }
+
+ printf("\n");
+
+ for (j = 0; j < MAX_CPU; j++) {
+ data = &stat_data[j];
+
+ printf("CPU-%-6d ", j);
+ for (i = 0; i < MAX_CSTATE_ENTRIES; i++)
+ printf("%-11ld ", data->cstate[i] / 1000000);
+
+ for (i = 0; i < MAX_PSTATE_ENTRIES; i++)
+ printf("%-11ld ", data->pstate[i] / 1000000);
+
+ printf("\n");
+ }
+}
+
+static void cpu_stat_update(int cstate_fd, int pstate_fd)
+{
+ unsigned long key, value;
+ int c, i;
+
+ for (c = 0; c < MAX_CPU; c++) {
+ for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
+ key = c * MAX_CSTATE_ENTRIES + i;
+ bpf_map_lookup_elem(cstate_fd, &key, &value);
+ stat_data[c].cstate[i] = value;
+ }
+
+ for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
+ key = c * MAX_PSTATE_ENTRIES + i;
+ bpf_map_lookup_elem(pstate_fd, &key, &value);
+ stat_data[c].pstate[i] = value;
+ }
+ }
+}
+
+/*
+ * This function is copied from 'idlestat' tool function
+ * idlestat_wake_all() in idlestate.c.
+ *
+ * It sets the self running task affinity to cpus one by one so can wake up
+ * the specific CPU to handle scheduling; this results in all cpus can be
+ * waken up once and produce ftrace event 'trace_cpu_idle'.
+ */
+static int cpu_stat_inject_cpu_idle_event(void)
+{
+ int rcpu, i, ret;
+ cpu_set_t cpumask;
+ cpu_set_t original_cpumask;
+
+ ret = sysconf(_SC_NPROCESSORS_CONF);
+ if (ret < 0)
+ return -1;
+
+ rcpu = sched_getcpu();
+ if (rcpu < 0)
+ return -1;
+
+ /* Keep track of the CPUs we will run on */
+ sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask);
+
+ for (i = 0; i < ret; i++) {
+
+ /* Pointless to wake up ourself */
+ if (i == rcpu)
+ continue;
+
+ /* Pointless to wake CPUs we will not run on */
+ if (!CPU_ISSET(i, &original_cpumask))
+ continue;
+
+ CPU_ZERO(&cpumask);
+ CPU_SET(i, &cpumask);
+
+ sched_setaffinity(0, sizeof(cpumask), &cpumask);
+ }
+
+ /* Enable all the CPUs of the original mask */
+ sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask);
+ return 0;
+}
+
+/*
+ * It's possible to have no any frequency change for long time and cannot
+ * get ftrace event 'trace_cpu_frequency' for long period, this introduces
+ * big deviation for pstate statistics.
+ *
+ * To solve this issue, below code forces to set 'scaling_max_freq' to 208MHz
+ * for triggering ftrace event 'trace_cpu_frequency' and then recovery back to
+ * the maximum frequency value 1.2GHz.
+ */
+static int cpu_stat_inject_cpu_frequency_event(void)
+{
+ int len, fd;
+
+ fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY);
+ if (fd < 0) {
+ printf("failed to open scaling_max_freq, errno=%d\n", errno);
+ return fd;
+ }
+
+ len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ));
+ if (len < 0) {
+ printf("failed to open scaling_max_freq, errno=%d\n", errno);
+ goto err;
+ }
+
+ len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ));
+ if (len < 0) {
+ printf("failed to open scaling_max_freq, errno=%d\n", errno);
+ goto err;
+ }
+
+err:
+ close(fd);
+ return len;
+}
+
+static void int_exit(int sig)
+{
+ cpu_stat_inject_cpu_idle_event();
+ cpu_stat_inject_cpu_frequency_event();
+ cpu_stat_update(map_fd[1], map_fd[2]);
+ cpu_stat_print();
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+ int ret;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ ret = cpu_stat_inject_cpu_idle_event();
+ if (ret < 0)
+ return 1;
+
+ ret = cpu_stat_inject_cpu_frequency_event();
+ if (ret < 0)
+ return 1;
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ while (1) {
+ cpu_stat_update(map_fd[1], map_fd[2]);
+ cpu_stat_print();
+ sleep(5);
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index d54e91eb6cbf..b701b5c21342 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -20,6 +20,7 @@
#include <string.h>
#include <unistd.h>
#include <libgen.h>
+#include <sys/resource.h>
#include "bpf_load.h"
#include "bpf_util.h"
@@ -75,6 +76,7 @@ static void usage(const char *prog)
int main(int argc, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
const char *optstr = "SN";
char filename[256];
int ret, opt, key = 0;
@@ -98,6 +100,11 @@ int main(int argc, char **argv)
return 1;
}
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
ifindex_in = strtoul(argv[optind], NULL, 0);
ifindex_out = strtoul(argv[optind + 1], NULL, 0);
printf("input: %d output: %d\n", ifindex_in, ifindex_out);
diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile
index 73f1da4d116c..9bf2881bd11b 100644
--- a/samples/sockmap/Makefile
+++ b/samples/sockmap/Makefile
@@ -2,7 +2,7 @@
hostprogs-y := sockmap
# Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o
+LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index 7c25c0c112bc..95a54a89a532 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -566,6 +566,7 @@ run:
else
fprintf(stderr, "unknown test\n");
out:
+ bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
close(s1);
close(s2);
close(p1);
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 5c43c187f27c..8567a858b789 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,12 +35,14 @@ TEST_GEN_PROGS_EXTENDED = test_libbpf_open
include ../lib.mk
-BPFOBJ := $(OUTPUT)/libbpf.a cgroup_helpers.c
+BPFOBJ := $(OUTPUT)/libbpf.a
$(TEST_GEN_PROGS): $(BPFOBJ)
$(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
+$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
+
.PHONY: force
# force a rebuild of BPFOBJ when its dependencies are updated
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
index 95a370f3d378..5d73db416460 100644
--- a/tools/testing/selftests/bpf/test_tcpbpf_user.c
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -11,6 +11,8 @@
#include <linux/ptrace.h>
#include <linux/bpf.h>
#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/resource.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -42,6 +44,7 @@ static int bpf_find_map(const char *test, struct bpf_object *obj,
int main(int argc, char **argv)
{
+ struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY };
const char *file = "test_tcpbpf_kern.o";
struct tcpbpf_globals g = {0};
int cg_fd, prog_fd, map_fd;
@@ -54,6 +57,9 @@ int main(int argc, char **argv)
int pid;
int rv;
+ if (setrlimit(RLIMIT_MEMLOCK, &limit) < 0)
+ perror("Unable to lift memlock rlimit");
+
if (argc > 1 && strcmp(argv[1], "-d") == 0)
debug_flag = true;
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index c73592fa3d41..2164d218322e 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -57,6 +57,9 @@
#define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0)
#define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1)
+#define UNPRIV_SYSCTL "kernel/unprivileged_bpf_disabled"
+static bool unpriv_disabled = false;
+
struct bpf_test {
const char *descr;
struct bpf_insn insns[MAX_INSNS];
@@ -11163,6 +11166,95 @@ static struct bpf_test tests[] = {
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
+ {
+ "jit: lsh, rsh, arsh by 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 0xff),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1),
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+ },
+ {
+ "jit: mov32 for ldimm64, 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_1, 0xfeffffffffffffffULL),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32),
+ BPF_LD_IMM64(BPF_REG_2, 0xfeffffffULL),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+ },
+ {
+ "jit: mov32 for ldimm64, 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_1, 0x1ffffffffULL),
+ BPF_LD_IMM64(BPF_REG_2, 0xffffffffULL),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+ },
+ {
+ "jit: various mul tests",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL),
+ BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL),
+ BPF_LD_IMM64(BPF_REG_1, 0xefefefULL),
+ BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_1),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL),
+ BPF_ALU64_REG(BPF_MUL, BPF_REG_3, BPF_REG_1),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV32_REG(BPF_REG_2, BPF_REG_2),
+ BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL),
+ BPF_ALU32_REG(BPF_MUL, BPF_REG_0, BPF_REG_1),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL),
+ BPF_ALU32_REG(BPF_MUL, BPF_REG_3, BPF_REG_1),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LD_IMM64(BPF_REG_0, 0x952a7bbcULL),
+ BPF_LD_IMM64(BPF_REG_1, 0xfefefeULL),
+ BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL),
+ BPF_ALU32_REG(BPF_MUL, BPF_REG_2, BPF_REG_1),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+ },
+
};
static int probe_filter_length(const struct bpf_insn *fp)
@@ -11317,7 +11409,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
goto fail_log;
}
if (!strstr(bpf_vlog, expected_err) && !reject_from_alignment) {
- printf("FAIL\nUnexpected error message!\n");
+ printf("FAIL\nUnexpected error message!\n\tEXP: %s\n\tRES: %s\n",
+ expected_err, bpf_vlog);
goto fail_log;
}
}
@@ -11401,9 +11494,20 @@ out:
return ret;
}
+static void get_unpriv_disabled()
+{
+ char buf[2];
+ FILE *fd;
+
+ fd = fopen("/proc/sys/"UNPRIV_SYSCTL, "r");
+ if (fgets(buf, 2, fd) == buf && atoi(buf))
+ unpriv_disabled = true;
+ fclose(fd);
+}
+
static int do_test(bool unpriv, unsigned int from, unsigned int to)
{
- int i, passes = 0, errors = 0;
+ int i, passes = 0, errors = 0, skips = 0;
for (i = from; i < to; i++) {
struct bpf_test *test = &tests[i];
@@ -11411,7 +11515,10 @@ static int do_test(bool unpriv, unsigned int from, unsigned int to)
/* Program types that are not supported by non-root we
* skip right away.
*/
- if (!test->prog_type) {
+ if (!test->prog_type && unpriv_disabled) {
+ printf("#%d/u %s SKIP\n", i, test->descr);
+ skips++;
+ } else if (!test->prog_type) {
if (!unpriv)
set_admin(false);
printf("#%d/u %s ", i, test->descr);
@@ -11420,13 +11527,17 @@ static int do_test(bool unpriv, unsigned int from, unsigned int to)
set_admin(true);
}
- if (!unpriv) {
+ if (unpriv) {
+ printf("#%d/p %s SKIP\n", i, test->descr);
+ skips++;
+ } else {
printf("#%d/p %s ", i, test->descr);
do_test_single(test, false, &passes, &errors);
}
}
- printf("Summary: %d PASSED, %d FAILED\n", passes, errors);
+ printf("Summary: %d PASSED, %d SKIPPED, %d FAILED\n", passes,
+ skips, errors);
return errors ? EXIT_FAILURE : EXIT_SUCCESS;
}
@@ -11454,6 +11565,13 @@ int main(int argc, char **argv)
}
}
+ get_unpriv_disabled();
+ if (unpriv && unpriv_disabled) {
+ printf("Cannot run as unprivileged user with sysctl %s.\n",
+ UNPRIV_SYSCTL);
+ return EXIT_FAILURE;
+ }
+
setrlimit(RLIMIT_MEMLOCK, unpriv ? &rlim : &rinf);
return do_test(unpriv, from, to);
}