 arch/x86/net/bpf_jit_comp.c                     | 219
 include/linux/bpf-cgroup.h                      |   2
 include/linux/filter.h                          |   2
 kernel/bpf/verifier.c                           |   4
 samples/bpf/Makefile                            |   4
 samples/bpf/cpustat_kern.c                      | 281
 samples/bpf/cpustat_user.c                      | 219
 samples/bpf/xdp_redirect_user.c                 |   7
 samples/sockmap/Makefile                        |   2
 samples/sockmap/sockmap_user.c                  |   1
 tools/testing/selftests/bpf/Makefile            |   4
 tools/testing/selftests/bpf/test_tcpbpf_user.c  |   6
 tools/testing/selftests/bpf/test_verifier.c     | 128
 13 files changed, 779 insertions(+), 100 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 45e4eb5bcbb2..cbf94d44f3d5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -61,7 +61,12 @@ static bool is_imm8(int value) static bool is_simm32(s64 value) { - return value == (s64) (s32) value; + return value == (s64)(s32)value; +} + +static bool is_uimm32(u64 value) +{ + return value == (u64)(u32)value; } /* mov dst, src */ @@ -212,7 +217,7 @@ struct jit_context { /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; int cnt = 0; @@ -247,18 +252,21 @@ static void emit_prologue(u8 **pprog, u32 stack_depth) /* mov qword ptr [rbp+24],r15 */ EMIT4(0x4C, 0x89, 0x7D, 24); - /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls - * we need to reset the counter to 0. It's done in two instructions, - * resetting rax register to 0 (xor on eax gets 0 extended), and - * moving it to the counter location. - */ + if (!ebpf_from_cbpf) { + /* Clear the tail call counter (tail_call_cnt): for eBPF tail + * calls we need to reset the counter to 0. It's done in two + * instructions, resetting rax register to 0, and moving it + * to the counter location. + */ + + /* xor eax, eax */ + EMIT2(0x31, 0xc0); + /* mov qword ptr [rbp+32], rax */ + EMIT4(0x48, 0x89, 0x45, 32); - /* xor eax, eax */ - EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp+32], rax */ - EMIT4(0x48, 0x89, 0x45, 32); + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + } - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); *pprog = prog; } @@ -356,6 +364,86 @@ static void emit_load_skb_data_hlen(u8 **pprog) *pprog = prog; } +static void emit_mov_imm32(u8 **pprog, bool sign_propagate, + u32 dst_reg, const u32 imm32) +{ + u8 *prog = *pprog; + u8 b1, b2, b3; + int cnt = 0; + + /* optimization: if imm32 is positive, use 'mov %eax, imm32' + * (which zero-extends imm32) to save 2 bytes. + */ + if (sign_propagate && (s32)imm32 < 0) { + /* 'mov %rax, imm32' sign extends imm32 */ + b1 = add_1mod(0x48, dst_reg); + b2 = 0xC7; + b3 = 0xC0; + EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); + goto done; + } + + /* optimization: if imm32 is zero, use 'xor %eax, %eax' + * to save 3 bytes. + */ + if (imm32 == 0) { + if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); + goto done; + } + + /* mov %eax, imm32 */ + if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); +done: + *pprog = prog; +} + +static void emit_mov_imm64(u8 **pprog, u32 dst_reg, + const u32 imm32_hi, const u32 imm32_lo) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + /* For emitting plain u32, where sign bit must not be + * propagated LLVM tends to load imm64 over mov32 + * directly, so save couple of bytes by just doing + * 'mov %eax, imm32' instead. 
+ */ + emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else { + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(imm32_lo, 4); + EMIT(imm32_hi, 4); + } + + *pprog = prog; +} + +static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) { + /* mov dst, src */ + EMIT_mov(dst_reg, src_reg); + } else { + /* mov32 dst, src */ + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + } + + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -369,7 +457,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int proglen = 0; u8 *prog = temp; - emit_prologue(&prog, bpf_prog->aux->stack_depth); + emit_prologue(&prog, bpf_prog->aux->stack_depth, + bpf_prog_was_classic(bpf_prog)); if (seen_ld_abs) emit_load_skb_data_hlen(&prog); @@ -378,7 +467,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; - u8 b1 = 0, b2 = 0, b3 = 0; + u8 b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; bool reload_skb_data; @@ -414,16 +503,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; - /* mov dst, src */ case BPF_ALU64 | BPF_MOV | BPF_X: - EMIT_mov(dst_reg, src_reg); - break; - - /* mov32 dst, src */ case BPF_ALU | BPF_MOV | BPF_X: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); - EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + emit_mov_reg(&prog, + BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, src_reg); break; /* neg dst */ @@ -486,58 +570,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_ALU64 | BPF_MOV | BPF_K: - /* optimization: if imm32 is positive, - * use 'mov eax, imm32' (which zero-extends imm32) - * to save 2 bytes - */ - if (imm32 < 0) { - /* 'mov rax, imm32' sign extends imm32 */ - b1 = add_1mod(0x48, dst_reg); - b2 = 0xC7; - b3 = 0xC0; - EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); - break; - } - case BPF_ALU | BPF_MOV | BPF_K: - /* optimization: if imm32 is zero, use 'xor <dst>,<dst>' - * to save 3 bytes. - */ - if (imm32 == 0) { - if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); - break; - } - - /* mov %eax, imm32 */ - if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); - EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); + emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, imm32); break; case BPF_LD | BPF_IMM | BPF_DW: - /* optimization: if imm64 is zero, use 'xor <dst>,<dst>' - * to save 7 bytes. 
- */ - if (insn[0].imm == 0 && insn[1].imm == 0) { - b1 = add_2mod(0x48, dst_reg, dst_reg); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg)); - - insn++; - i++; - break; - } - - /* movabsq %rax, imm64 */ - EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); - EMIT(insn[0].imm, 4); - EMIT(insn[1].imm, 4); - + emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm); insn++; i++; break; @@ -594,36 +633,38 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_X: - EMIT1(0x50); /* push rax */ - EMIT1(0x52); /* push rdx */ + { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + + if (dst_reg != BPF_REG_0) + EMIT1(0x50); /* push rax */ + if (dst_reg != BPF_REG_3) + EMIT1(0x52); /* push rdx */ /* mov r11, dst_reg */ EMIT_mov(AUX_REG, dst_reg); if (BPF_SRC(insn->code) == BPF_X) - /* mov rax, src_reg */ - EMIT_mov(BPF_REG_0, src_reg); + emit_mov_reg(&prog, is64, BPF_REG_0, src_reg); else - /* mov rax, imm32 */ - EMIT3_off32(0x48, 0xC7, 0xC0, imm32); + emit_mov_imm32(&prog, is64, BPF_REG_0, imm32); - if (BPF_CLASS(insn->code) == BPF_ALU64) + if (is64) EMIT1(add_1mod(0x48, AUX_REG)); else if (is_ereg(AUX_REG)) EMIT1(add_1mod(0x40, AUX_REG)); /* mul(q) r11 */ EMIT2(0xF7, add_1reg(0xE0, AUX_REG)); - /* mov r11, rax */ - EMIT_mov(AUX_REG, BPF_REG_0); - - EMIT1(0x5A); /* pop rdx */ - EMIT1(0x58); /* pop rax */ - - /* mov dst_reg, r11 */ - EMIT_mov(dst_reg, AUX_REG); + if (dst_reg != BPF_REG_3) + EMIT1(0x5A); /* pop rdx */ + if (dst_reg != BPF_REG_0) { + /* mov dst_reg, rax */ + EMIT_mov(dst_reg, BPF_REG_0); + EMIT1(0x58); /* pop rax */ + } break; - + } /* shifts */ case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: @@ -641,7 +682,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_RSH: b3 = 0xE8; break; case BPF_ARSH: b3 = 0xF8; break; } - EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); + + if (imm32 == 1) + EMIT2(0xD1, add_1reg(b3, dst_reg)); + else + EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); break; case BPF_ALU | BPF_LSH | BPF_X: diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7f16e0f8d68..8a4566691c8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -96,7 +96,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk) { \ + if (cgroup_bpf_enabled) { \ __ret = __cgroup_bpf_run_filter_sk(sk, \ BPF_CGROUP_INET_SOCK_CREATE); \ } \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 276932d75975..fdb691b520c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -20,7 +20,6 @@ #include <linux/set_memory.h> #include <linux/kallsyms.h> -#include <net/xdp.h> #include <net/sch_generic.h> #include <uapi/linux/filter.h> @@ -30,6 +29,7 @@ struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; +struct xdp_rxq_info; /* ArgX, context and stack frame pointer register positions. 
Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5fb69a85d967..3c74b163eaeb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -508,10 +508,6 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -#define CALLEE_SAVED_REGS 5 -static const int callee_saved[CALLEE_SAVED_REGS] = { - BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 -}; static void __mark_reg_not_init(struct bpf_reg_state *reg); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ec3fc8d88e87..2c2a587e0942 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp +hostprogs-y += cpustat # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o @@ -93,6 +94,7 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o +cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -144,6 +146,7 @@ always += xdp_monitor_kern.o always += xdp_rxq_info_kern.o always += xdp2skb_meta_kern.o always += syscall_tp_kern.o +always += cpustat_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -188,6 +191,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf +HOSTLOADLIBES_cpustat += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c new file mode 100644 index 000000000000..68c84da065b1 --- /dev/null +++ b/samples/bpf/cpustat_kern.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* + * The CPU number, cstate number and pstate number are based + * on 96boards Hikey with octa CA53 CPUs. + * + * Every CPU have three idle states for cstate: + * WFI, CPU_OFF, CLUSTER_OFF + * + * Every CPU have 5 operating points: + * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz + * + * This code is based on these assumption and other platforms + * need to adjust these definitions. + */ +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 + +static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; + +/* + * my_map structure is used to record cstate and pstate index and + * timestamp (Idx, Ts), when new event incoming we need to update + * combination for new state index and timestamp (Idx`, Ts`). + * + * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time + * interval for the previous state: Duration(Idx) = Ts` - Ts. 
+ * + * Every CPU has one below array for recording state index and + * timestamp, and record for cstate and pstate saperately: + * + * +--------------------------+ + * | cstate timestamp | + * +--------------------------+ + * | cstate index | + * +--------------------------+ + * | pstate timestamp | + * +--------------------------+ + * | pstate index | + * +--------------------------+ + */ +#define MAP_OFF_CSTATE_TIME 0 +#define MAP_OFF_CSTATE_IDX 1 +#define MAP_OFF_PSTATE_TIME 2 +#define MAP_OFF_PSTATE_IDX 3 +#define MAP_OFF_NUM 4 + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAP_OFF_NUM, +}; + +/* cstate_duration records duration time for every idle state per CPU */ +struct bpf_map_def SEC("maps") cstate_duration = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES, +}; + +/* pstate_duration records duration time for every operating point per CPU */ +struct bpf_map_def SEC("maps") pstate_duration = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES, +}; + +/* + * The trace events for cpu_idle and cpu_frequency are taken from: + * /sys/kernel/debug/tracing/events/power/cpu_idle/format + * /sys/kernel/debug/tracing/events/power/cpu_frequency/format + * + * These two events have same format, so define one common structure. + */ +struct cpu_args { + u64 pad; + u32 state; + u32 cpu_id; +}; + +/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ +static u32 find_cpu_pstate_idx(u32 frequency) +{ + u32 i; + + for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { + if (frequency == cpu_opps[i]) + return i; + } + + return i; +} + +SEC("tracepoint/power/cpu_idle") +int bpf_prog1(struct cpu_args *ctx) +{ + u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + if (ctx->cpu_id > MAX_CPU) + return 0; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; + cts = bpf_map_lookup_elem(&my_map, &key); + if (!cts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + prev_state = *cstate; + *cstate = ctx->state; + + if (!*cts) { + *cts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *cts; + *cts = cur_ts; + + /* + * When state doesn't equal to (u32)-1, the cpu will enter + * one idle state; for this case we need to record interval + * for the pstate. 
+ * + * OPP2 + * +---------------------+ + * OPP1 | | + * ---------+ | + * | Idle state + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + if (ctx->state != (u32)-1) { + + /* record pstate after have first cpu_frequency event */ + if (!*pts) + return 0; + + delta = cur_ts - *pts; + + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + /* + * When state equal to (u32)-1, the cpu just exits from one + * specific idle state; for this case we need to record + * interval for the pstate. + * + * OPP2 + * -----------+ + * | OPP1 + * | +----------- + * | Idle state | + * +---------------------+ + * + * |<- cstate duration ->| + * ^ ^ + * cts cur_ts + */ + } else { + + key = cpu * MAX_CSTATE_ENTRIES + prev_state; + val = bpf_map_lookup_elem(&cstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + } + + /* Update timestamp for pstate as new start time */ + if (*pts) + *pts = cur_ts; + + return 0; +} + +SEC("tracepoint/power/cpu_frequency") +int bpf_prog2(struct cpu_args *ctx) +{ + u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + prev_state = *pstate; + *pstate = ctx->state; + + if (!*pts) { + *pts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *pts; + *pts = cur_ts; + + /* When CPU is in idle, bail out to skip pstate statistics */ + if (*cstate != (u32)(-1)) + return 0; + + /* + * The cpu changes to another different OPP (in below diagram + * change frequency from OPP3 to OPP1), need recording interval + * for previous frequency OPP3 and update timestamp as start + * time for new frequency OPP1. 
+ * + * OPP3 + * +---------------------+ + * OPP2 | | + * ---------+ | + * | OPP1 + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c new file mode 100644 index 000000000000..2b4cd1ae57c5 --- /dev/null +++ b/samples/bpf/cpustat_user.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sched.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <locale.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/wait.h> + +#include "libbpf.h" +#include "bpf_load.h" + +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 +#define MAX_STARS 40 + +#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq" +#define CPUFREQ_LOWEST_FREQ "208000" +#define CPUFREQ_HIGHEST_FREQ "12000000" + +struct cpu_stat_data { + unsigned long cstate[MAX_CSTATE_ENTRIES]; + unsigned long pstate[MAX_PSTATE_ENTRIES]; +}; + +static struct cpu_stat_data stat_data[MAX_CPU]; + +static void cpu_stat_print(void) +{ + int i, j; + char state_str[sizeof("cstate-9")]; + struct cpu_stat_data *data; + + /* Clear screen */ + printf("\033[2J"); + + /* Header */ + printf("\nCPU states statistics:\n"); + printf("%-10s ", "state(ms)"); + + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + sprintf(state_str, "cstate-%d", i); + printf("%-11s ", state_str); + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + sprintf(state_str, "pstate-%d", i); + printf("%-11s ", state_str); + } + + printf("\n"); + + for (j = 0; j < MAX_CPU; j++) { + data = &stat_data[j]; + + printf("CPU-%-6d ", j); + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) + printf("%-11ld ", data->cstate[i] / 1000000); + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) + printf("%-11ld ", data->pstate[i] / 1000000); + + printf("\n"); + } +} + +static void cpu_stat_update(int cstate_fd, int pstate_fd) +{ + unsigned long key, value; + int c, i; + + for (c = 0; c < MAX_CPU; c++) { + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + key = c * MAX_CSTATE_ENTRIES + i; + bpf_map_lookup_elem(cstate_fd, &key, &value); + stat_data[c].cstate[i] = value; + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + key = c * MAX_PSTATE_ENTRIES + i; + bpf_map_lookup_elem(pstate_fd, &key, &value); + stat_data[c].pstate[i] = value; + } + } +} + +/* + * This function is copied from 'idlestat' tool function + * idlestat_wake_all() in idlestate.c. + * + * It sets the self running task affinity to cpus one by one so can wake up + * the specific CPU to handle scheduling; this results in all cpus can be + * waken up once and produce ftrace event 'trace_cpu_idle'. 
+ */ +static int cpu_stat_inject_cpu_idle_event(void) +{ + int rcpu, i, ret; + cpu_set_t cpumask; + cpu_set_t original_cpumask; + + ret = sysconf(_SC_NPROCESSORS_CONF); + if (ret < 0) + return -1; + + rcpu = sched_getcpu(); + if (rcpu < 0) + return -1; + + /* Keep track of the CPUs we will run on */ + sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask); + + for (i = 0; i < ret; i++) { + + /* Pointless to wake up ourself */ + if (i == rcpu) + continue; + + /* Pointless to wake CPUs we will not run on */ + if (!CPU_ISSET(i, &original_cpumask)) + continue; + + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + sched_setaffinity(0, sizeof(cpumask), &cpumask); + } + + /* Enable all the CPUs of the original mask */ + sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask); + return 0; +} + +/* + * It's possible to have no any frequency change for long time and cannot + * get ftrace event 'trace_cpu_frequency' for long period, this introduces + * big deviation for pstate statistics. + * + * To solve this issue, below code forces to set 'scaling_max_freq' to 208MHz + * for triggering ftrace event 'trace_cpu_frequency' and then recovery back to + * the maximum frequency value 1.2GHz. + */ +static int cpu_stat_inject_cpu_frequency_event(void) +{ + int len, fd; + + fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY); + if (fd < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + return fd; + } + + len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + + len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + +err: + close(fd); + return len; +} + +static void int_exit(int sig) +{ + cpu_stat_inject_cpu_idle_event(); + cpu_stat_inject_cpu_frequency_event(); + cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_print(); + exit(0); +} + +int main(int argc, char **argv) +{ + char filename[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + ret = cpu_stat_inject_cpu_idle_event(); + if (ret < 0) + return 1; + + ret = cpu_stat_inject_cpu_frequency_event(); + if (ret < 0) + return 1; + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + while (1) { + cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_print(); + sleep(5); + } + + return 0; +} diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index d54e91eb6cbf..b701b5c21342 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -20,6 +20,7 @@ #include <string.h> #include <unistd.h> #include <libgen.h> +#include <sys/resource.h> #include "bpf_load.h" #include "bpf_util.h" @@ -75,6 +76,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int ret, opt, key = 0; @@ -98,6 +100,11 @@ int main(int argc, char **argv) return 1; } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex_in = strtoul(argv[optind], NULL, 0); ifindex_out = strtoul(argv[optind + 1], NULL, 0); printf("input: %d output: %d\n", ifindex_in, ifindex_out); diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile index 73f1da4d116c..9bf2881bd11b 100644 --- a/samples/sockmap/Makefile +++ 
b/samples/sockmap/Makefile @@ -2,7 +2,7 @@ hostprogs-y := sockmap # Libbpf dependencies -LIBBPF := ../../tools/lib/bpf/bpf.o +LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7c25c0c112bc..95a54a89a532 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -566,6 +566,7 @@ run: else fprintf(stderr, "unknown test\n"); out: + bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); close(s1); close(s2); close(p1); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 5c43c187f27c..8567a858b789 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -35,12 +35,14 @@ TEST_GEN_PROGS_EXTENDED = test_libbpf_open include ../lib.mk -BPFOBJ := $(OUTPUT)/libbpf.a cgroup_helpers.c +BPFOBJ := $(OUTPUT)/libbpf.a $(TEST_GEN_PROGS): $(BPFOBJ) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a +$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c + .PHONY: force # force a rebuild of BPFOBJ when its dependencies are updated diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c index 95a370f3d378..5d73db416460 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf_user.c +++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c @@ -11,6 +11,8 @@ #include <linux/ptrace.h> #include <linux/bpf.h> #include <sys/ioctl.h> +#include <sys/time.h> +#include <sys/resource.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> @@ -42,6 +44,7 @@ static int bpf_find_map(const char *test, struct bpf_object *obj, int main(int argc, char **argv) { + struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; const char *file = "test_tcpbpf_kern.o"; struct tcpbpf_globals g = {0}; int cg_fd, prog_fd, map_fd; @@ -54,6 +57,9 @@ int main(int argc, char **argv) int pid; int rv; + if (setrlimit(RLIMIT_MEMLOCK, &limit) < 0) + perror("Unable to lift memlock rlimit"); + if (argc > 1 && strcmp(argv[1], "-d") == 0) debug_flag = true; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index c73592fa3d41..2164d218322e 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -57,6 +57,9 @@ #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) #define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) +#define UNPRIV_SYSCTL "kernel/unprivileged_bpf_disabled" +static bool unpriv_disabled = false; + struct bpf_test { const char *descr; struct bpf_insn insns[MAX_INSNS]; @@ -11163,6 +11166,95 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "jit: lsh, rsh, arsh by 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_1, 0xff), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1), + BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + { + "jit: mov32 for ldimm64, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_LD_IMM64(BPF_REG_1, 0xfeffffffffffffffULL), + BPF_ALU64_IMM(BPF_RSH, 
BPF_REG_1, 32), + BPF_LD_IMM64(BPF_REG_2, 0xfeffffffULL), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + { + "jit: mov32 for ldimm64, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 0x1ffffffffULL), + BPF_LD_IMM64(BPF_REG_2, 0xffffffffULL), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + { + "jit: various mul tests", + .insns = { + BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL), + BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL), + BPF_LD_IMM64(BPF_REG_1, 0xefefefULL), + BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL), + BPF_ALU64_REG(BPF_MUL, BPF_REG_3, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV32_REG(BPF_REG_2, BPF_REG_2), + BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL), + BPF_ALU32_REG(BPF_MUL, BPF_REG_0, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL), + BPF_ALU32_REG(BPF_MUL, BPF_REG_3, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LD_IMM64(BPF_REG_0, 0x952a7bbcULL), + BPF_LD_IMM64(BPF_REG_1, 0xfefefeULL), + BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL), + BPF_ALU32_REG(BPF_MUL, BPF_REG_2, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + }; static int probe_filter_length(const struct bpf_insn *fp) @@ -11317,7 +11409,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, goto fail_log; } if (!strstr(bpf_vlog, expected_err) && !reject_from_alignment) { - printf("FAIL\nUnexpected error message!\n"); + printf("FAIL\nUnexpected error message!\n\tEXP: %s\n\tRES: %s\n", + expected_err, bpf_vlog); goto fail_log; } } @@ -11401,9 +11494,20 @@ out: return ret; } +static void get_unpriv_disabled() +{ + char buf[2]; + FILE *fd; + + fd = fopen("/proc/sys/"UNPRIV_SYSCTL, "r"); + if (fgets(buf, 2, fd) == buf && atoi(buf)) + unpriv_disabled = true; + fclose(fd); +} + static int do_test(bool unpriv, unsigned int from, unsigned int to) { - int i, passes = 0, errors = 0; + int i, passes = 0, errors = 0, skips = 0; for (i = from; i < to; i++) { struct bpf_test *test = &tests[i]; @@ -11411,7 +11515,10 @@ static int do_test(bool unpriv, unsigned int from, unsigned int to) /* Program types that are not supported by non-root we * skip right away. */ - if (!test->prog_type) { + if (!test->prog_type && unpriv_disabled) { + printf("#%d/u %s SKIP\n", i, test->descr); + skips++; + } else if (!test->prog_type) { if (!unpriv) set_admin(false); printf("#%d/u %s ", i, test->descr); @@ -11420,13 +11527,17 @@ static int do_test(bool unpriv, unsigned int from, unsigned int to) set_admin(true); } - if (!unpriv) { + if (unpriv) { + printf("#%d/p %s SKIP\n", i, test->descr); + skips++; + } else { printf("#%d/p %s ", i, test->descr); do_test_single(test, false, &passes, &errors); } } - printf("Summary: %d PASSED, %d FAILED\n", passes, errors); + printf("Summary: %d PASSED, %d SKIPPED, %d FAILED\n", passes, + skips, errors); return errors ? 
EXIT_FAILURE : EXIT_SUCCESS; } @@ -11454,6 +11565,13 @@ int main(int argc, char **argv) } } + get_unpriv_disabled(); + if (unpriv && unpriv_disabled) { + printf("Cannot run as unprivileged user with sysctl %s.\n", + UNPRIV_SYSCTL); + return EXIT_FAILURE; + } + setrlimit(RLIMIT_MEMLOCK, unpriv ? &rlim : &rinf); return do_test(unpriv, from, to); } |