110 files changed, 6901 insertions(+), 2721 deletions(-)
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 97dc386e3cb8..cc29869d12a3 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1260,12 +1260,9 @@ static inline void emit_push_r64(const s8 src[], struct jit_ctx *ctx)
 
 static void build_prologue(struct jit_ctx *ctx)
 {
-	const s8 r0 = bpf2a32[BPF_REG_0][1];
-	const s8 r2 = bpf2a32[BPF_REG_1][1];
-	const s8 r3 = bpf2a32[BPF_REG_1][0];
-	const s8 r4 = bpf2a32[BPF_REG_6][1];
-	const s8 fplo = bpf2a32[BPF_REG_FP][1];
-	const s8 fphi = bpf2a32[BPF_REG_FP][0];
+	const s8 arm_r0 = bpf2a32[BPF_REG_0][1];
+	const s8 *bpf_r1 = bpf2a32[BPF_REG_1];
+	const s8 *bpf_fp = bpf2a32[BPF_REG_FP];
 	const s8 *tcc = bpf2a32[TCALL_CNT];
 
 	/* Save callee saved registers. */
@@ -1278,8 +1275,10 @@ static void build_prologue(struct jit_ctx *ctx)
 	emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx);
 	emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx);
 #endif
-	/* Save frame pointer for later */
-	emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx);
+	/* mov r3, #0 */
+	/* sub r2, sp, #SCRATCH_SIZE */
+	emit(ARM_MOV_I(bpf_r1[0], 0), ctx);
+	emit(ARM_SUB_I(bpf_r1[1], ARM_SP, SCRATCH_SIZE), ctx);
 
 	ctx->stack_size = imm8m(STACK_SIZE);
 
@@ -1287,18 +1286,15 @@ static void build_prologue(struct jit_ctx *ctx)
 	emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx);
 
 	/* Set up BPF prog stack base register */
-	emit_a32_mov_r(fplo, ARM_IP, ctx);
-	emit_a32_mov_i(fphi, 0, ctx);
+	emit_a32_mov_r64(true, bpf_fp, bpf_r1, ctx);
 
-	/* mov r4, 0 */
-	emit(ARM_MOV_I(r4, 0), ctx);
+	/* Initialize Tail Count */
+	emit(ARM_MOV_I(bpf_r1[1], 0), ctx);
+	emit_a32_mov_r64(true, tcc, bpf_r1, ctx);
 
 	/* Move BPF_CTX to BPF_R1 */
-	emit(ARM_MOV_R(r3, r4), ctx);
-	emit(ARM_MOV_R(r2, r0), ctx);
-	/* Initialize Tail Count */
-	emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[0])), ctx);
-	emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[1])), ctx);
+	emit(ARM_MOV_R(bpf_r1[1], arm_r0), ctx);
+
 	/* end of prologue */
 }
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1b4476ddb83..29d03459de20 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -69,6 +69,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
+	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
diff --git a/arch/riscv/include/asm/perf_event.h b/arch/riscv/include/asm/perf_event.h
index aefbfaa6a781..0234048b12bc 100644
--- a/arch/riscv/include/asm/perf_event.h
+++ b/arch/riscv/include/asm/perf_event.h
@@ -82,4 +82,8 @@ struct riscv_pmu {
 	int		irq;
 };
 
+#ifdef CONFIG_PERF_EVENTS
+#define perf_arch_bpf_user_pt_regs(regs) (struct user_regs_struct *)regs
+#endif
+
 #endif /* _ASM_RISCV_PERF_EVENT_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 36ae01761352..f66b87314fa2 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -94,6 +94,10 @@ extern pgd_t swapper_pg_dir[];
 #define VMALLOC_END      (PAGE_OFFSET - 1)
 #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
 
+#define BPF_JIT_REGION_SIZE	(SZ_128M)
+#define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)
+#define BPF_JIT_REGION_END	(VMALLOC_END)
+
 /*
  * Roughly size the vmemmap space to be large enough to fit enough
  * struct pages to map half the virtual address space.
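The BPF_JIT_REGION_* defines above carve a dedicated 128 MB window for JITed BPF images directly below the kernel's linear mapping, which keeps images within reach of PC-relative calls. A minimal user-space sketch of the same arithmetic — the PAGE_OFFSET value here is an assumed placeholder for illustration, not a real kernel configuration:

#include <assert.h>

#define SZ_128M              (128ULL * 1024 * 1024)
#define PAGE_OFFSET          0xffffffe000000000ULL /* assumed Sv39 value */
#define VMALLOC_END          (PAGE_OFFSET - 1)
#define BPF_JIT_REGION_SIZE  SZ_128M
#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END   VMALLOC_END

int main(void)
{
	/* The window sits immediately below PAGE_OFFSET. */
	assert(BPF_JIT_REGION_END - BPF_JIT_REGION_START == SZ_128M - 1);
	/* 128 MB is well inside the +/-2 GB span of an auipc+jalr pair,
	 * so any two addresses in the region can reach each other. */
	assert(BPF_JIT_REGION_SIZE < (1ULL << 31));
	return 0;
}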
diff --git a/arch/riscv/include/uapi/asm/bpf_perf_event.h b/arch/riscv/include/uapi/asm/bpf_perf_event.h
new file mode 100644
index 000000000000..6cb1c2823288
--- /dev/null
+++ b/arch/riscv/include/uapi/asm/bpf_perf_event.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__
+#define _UAPI__ASM_BPF_PERF_EVENT_H__
+
+#include <asm/ptrace.h>
+
+typedef struct user_regs_struct bpf_user_pt_regs_t;
+
+#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */
diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c
index 7fbf56aab661..483f4ad7f4dc 100644
--- a/arch/riscv/net/bpf_jit_comp.c
+++ b/arch/riscv/net/bpf_jit_comp.c
@@ -120,6 +120,11 @@ static bool seen_reg(int reg, struct rv_jit_context *ctx)
 	return false;
 }
 
+static void mark_fp(struct rv_jit_context *ctx)
+{
+	__set_bit(RV_CTX_F_SEEN_S5, &ctx->flags);
+}
+
 static void mark_call(struct rv_jit_context *ctx)
 {
 	__set_bit(RV_CTX_F_SEEN_CALL, &ctx->flags);
@@ -456,6 +461,11 @@ static u32 rv_amoadd_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
 	return rv_amo_insn(0, aq, rl, rs2, rs1, 3, rd, 0x2f);
 }
 
+static u32 rv_auipc(u8 rd, u32 imm31_12)
+{
+	return rv_u_insn(imm31_12, rd, 0x17);
+}
+
 static bool is_12b_int(s64 val)
 {
 	return -(1 << 11) <= val && val < (1 << 11);
@@ -479,27 +489,7 @@ static bool is_32b_int(s64 val)
 static int is_12b_check(int off, int insn)
 {
 	if (!is_12b_int(off)) {
-		pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n",
-		       insn, (int)off);
-		return -1;
-	}
-	return 0;
-}
-
-static int is_13b_check(int off, int insn)
-{
-	if (!is_13b_int(off)) {
-		pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n",
-		       insn, (int)off);
-		return -1;
-	}
-	return 0;
-}
-
-static int is_21b_check(int off, int insn)
-{
-	if (!is_21b_int(off)) {
-		pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n",
+		pr_err("bpf-jit: insn=%d 12b < offset=%d not supported yet!\n",
 		       insn, (int)off);
 		return -1;
 	}
@@ -545,10 +535,13 @@ static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
 		emit(rv_addi(rd, rd, lower), ctx);
 }
 
-static int rv_offset(int bpf_to, int bpf_from, struct rv_jit_context *ctx)
+static int rv_offset(int insn, int off, struct rv_jit_context *ctx)
 {
-	int from = ctx->offset[bpf_from] - 1, to = ctx->offset[bpf_to];
+	int from, to;
 
+	off++; /* BPF branch is from PC+1, RV is from PC */
+	from = (insn > 0) ? ctx->offset[insn - 1] : 0;
+	to = (insn + off > 0) ? ctx->offset[insn + off - 1] : 0;
 	return (to - from) << 2;
}
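The reworked rv_offset() turns a BPF instruction index plus branch offset into a RISC-V byte displacement, using ctx->offset[i] as the count of RV instructions emitted up to and including BPF insn i. A hedged stand-alone rendering with an invented offset table, purely for illustration:

#include <assert.h>

/* offset[i]: 32-bit RV instructions emitted for BPF insns 0..i.
 * These numbers are made up for the example. */
static const int offset[] = { 2, 5, 6, 9, 10 };

/* Mirrors the patched rv_offset(): BPF branches are relative to PC+1,
 * RV branches are relative to PC, hence the off++. */
static int rv_offset(int insn, int off)
{
	int from, to;

	off++;
	from = (insn > 0) ? offset[insn - 1] : 0;
	to = (insn + off > 0) ? offset[insn + off - 1] : 0;
	return (to - from) << 2; /* 4 bytes per RV instruction */
}

int main(void)
{
	/* BPF insn 1 jumping forward by 2 lands at BPF insn 4, i.e.
	 * RV position offset[3] = 9; the source is offset[0] = 2. */
	assert(rv_offset(1, 2) == (9 - 2) << 2);
	return 0;
}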
@@ -559,7 +552,7 @@ static int epilogue_offset(struct rv_jit_context *ctx)
 	return (to - from) << 2;
 }
 
-static void __build_epilogue(u8 reg, struct rv_jit_context *ctx)
+static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
 {
 	int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8;
 
@@ -596,8 +589,114 @@ static void __build_epilogue(u8 reg, struct rv_jit_context *ctx)
 	emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx);
 
 	/* Set return value. */
-	emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx);
-	emit(rv_jalr(RV_REG_ZERO, reg, 0), ctx);
+	if (!is_tail_call)
+		emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx);
+	emit(rv_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA,
+		     is_tail_call ? 4 : 0), /* skip TCC init */
+	     ctx);
+}
+
+/* return -1 or inverted cond */
+static int invert_bpf_cond(u8 cond)
+{
+	switch (cond) {
+	case BPF_JEQ:
+		return BPF_JNE;
+	case BPF_JGT:
+		return BPF_JLE;
+	case BPF_JLT:
+		return BPF_JGE;
+	case BPF_JGE:
+		return BPF_JLT;
+	case BPF_JLE:
+		return BPF_JGT;
+	case BPF_JNE:
+		return BPF_JEQ;
+	case BPF_JSGT:
+		return BPF_JSLE;
+	case BPF_JSLT:
+		return BPF_JSGE;
+	case BPF_JSGE:
+		return BPF_JSLT;
+	case BPF_JSLE:
+		return BPF_JSGT;
+	}
+	return -1;
+}
+
+static void emit_bcc(u8 cond, u8 rd, u8 rs, int rvoff,
+		     struct rv_jit_context *ctx)
+{
+	switch (cond) {
+	case BPF_JEQ:
+		emit(rv_beq(rd, rs, rvoff >> 1), ctx);
+		return;
+	case BPF_JGT:
+		emit(rv_bltu(rs, rd, rvoff >> 1), ctx);
+		return;
+	case BPF_JLT:
+		emit(rv_bltu(rd, rs, rvoff >> 1), ctx);
+		return;
+	case BPF_JGE:
+		emit(rv_bgeu(rd, rs, rvoff >> 1), ctx);
+		return;
+	case BPF_JLE:
+		emit(rv_bgeu(rs, rd, rvoff >> 1), ctx);
+		return;
+	case BPF_JNE:
+		emit(rv_bne(rd, rs, rvoff >> 1), ctx);
+		return;
+	case BPF_JSGT:
+		emit(rv_blt(rs, rd, rvoff >> 1), ctx);
+		return;
+	case BPF_JSLT:
+		emit(rv_blt(rd, rs, rvoff >> 1), ctx);
+		return;
+	case BPF_JSGE:
+		emit(rv_bge(rd, rs, rvoff >> 1), ctx);
+		return;
+	case BPF_JSLE:
+		emit(rv_bge(rs, rd, rvoff >> 1), ctx);
+	}
+}
+
+static void emit_branch(u8 cond, u8 rd, u8 rs, int rvoff,
+			struct rv_jit_context *ctx)
+{
+	s64 upper, lower;
+
+	if (is_13b_int(rvoff)) {
+		emit_bcc(cond, rd, rs, rvoff, ctx);
+		return;
+	}
+
+	/* Adjust for jal */
+	rvoff -= 4;
+
+	/* Transform, e.g.:
+	 *   bne rd,rs,foo
+	 * to
+	 *   beq rd,rs,<.L1>
+	 *   (auipc foo)
+	 *   jal(r) foo
+	 * .L1
+	 */
+	cond = invert_bpf_cond(cond);
+	if (is_21b_int(rvoff)) {
+		emit_bcc(cond, rd, rs, 8, ctx);
+		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		return;
+	}
+
+	/* 32b No need for an additional rvoff adjustment, since we
+	 * get that from the auipc at PC', where PC = PC' + 4.
+	 */
+	upper = (rvoff + (1 << 11)) >> 12;
+	lower = rvoff & 0xfff;
+
+	emit_bcc(cond, rd, rs, 12, ctx);
+	emit(rv_auipc(RV_REG_T1, upper), ctx);
+	emit(rv_jalr(RV_REG_ZERO, RV_REG_T1, lower), ctx);
+}
 
 static void emit_zext_32(u8 reg, struct rv_jit_context *ctx)
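The far-branch path above inverts the condition and then reaches the target with auipc+jalr. The +2^11 rounding in `upper` compensates for jalr sign-extending its low 12 bits. A self-contained check of that arithmetic (plain C; arithmetic right shift on signed values is assumed, as it is by the kernel's supported compilers):

#include <assert.h>
#include <stdint.h>

/* Split a PC-relative offset the way emit_branch()/emit_jump_and_link()
 * do for an auipc+jalr pair. */
static void split(int64_t off, int64_t *upper, int64_t *lower)
{
	*upper = (off + (1 << 11)) >> 12;
	*lower = off & 0xfff;
}

/* What the CPU computes: auipc adds upper<<12, jalr adds the
 * sign-extended 12-bit lower part. */
static int64_t reconstruct(int64_t upper, int64_t lower)
{
	int64_t simm12 = (lower & 0x800) ? lower - 0x1000 : lower;

	return (upper << 12) + simm12;
}

int main(void)
{
	int64_t off, upper, lower;

	for (off = -(1 << 20); off <= (1 << 20); off += 4093) {
		split(off, &upper, &lower);
		assert(reconstruct(upper, lower) == off);
	}
	return 0;
}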
@@ -627,18 +726,14 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 		return -1;
 	emit(rv_lwu(RV_REG_T1, off, RV_REG_A1), ctx);
 	off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
-	if (is_13b_check(off, insn))
-		return -1;
-	emit(rv_bgeu(RV_REG_A2, RV_REG_T1, off >> 1), ctx);
+	emit_branch(BPF_JGE, RV_REG_A2, RV_REG_T1, off, ctx);
 
 	/* if (TCC-- < 0)
 	 *     goto out;
 	 */
 	emit(rv_addi(RV_REG_T1, tcc, -1), ctx);
 	off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
-	if (is_13b_check(off, insn))
-		return -1;
-	emit(rv_blt(tcc, RV_REG_ZERO, off >> 1), ctx);
+	emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx);
 
 	/* prog = array->ptrs[index];
 	 * if (!prog)
@@ -651,18 +746,15 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 		return -1;
 	emit(rv_ld(RV_REG_T2, off, RV_REG_T2), ctx);
 	off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
-	if (is_13b_check(off, insn))
-		return -1;
-	emit(rv_beq(RV_REG_T2, RV_REG_ZERO, off >> 1), ctx);
+	emit_branch(BPF_JEQ, RV_REG_T2, RV_REG_ZERO, off, ctx);
 
 	/* goto *(prog->bpf_func + 4); */
 	off = offsetof(struct bpf_prog, bpf_func);
 	if (is_12b_check(off, insn))
 		return -1;
 	emit(rv_ld(RV_REG_T3, off, RV_REG_T2), ctx);
-	emit(rv_addi(RV_REG_T3, RV_REG_T3, 4), ctx);
 	emit(rv_addi(RV_REG_TCC, RV_REG_T1, 0), ctx);
-	__build_epilogue(RV_REG_T3, ctx);
+	__build_epilogue(true, ctx);
 	return 0;
 }
 
@@ -687,13 +779,6 @@ static void init_regs(u8 *rd, u8 *rs, const struct bpf_insn *insn,
 		*rs = bpf_to_rv_reg(insn->src_reg, ctx);
 }
 
-static int rv_offset_check(int *rvoff, s16 off, int insn,
-			   struct rv_jit_context *ctx)
-{
-	*rvoff = rv_offset(insn + off, insn, ctx);
-	return is_13b_check(*rvoff, insn);
-}
-
 static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
 {
 	emit(rv_addi(RV_REG_T2, *rd, 0), ctx);
@@ -726,13 +811,57 @@ static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx)
 	*rd = RV_REG_T2;
 }
 
+static void emit_jump_and_link(u8 rd, s64 rvoff, bool force_jalr,
+			       struct rv_jit_context *ctx)
+{
+	s64 upper, lower;
+
+	if (rvoff && is_21b_int(rvoff) && !force_jalr) {
+		emit(rv_jal(rd, rvoff >> 1), ctx);
+		return;
+	}
+
+	upper = (rvoff + (1 << 11)) >> 12;
+	lower = rvoff & 0xfff;
+	emit(rv_auipc(RV_REG_T1, upper), ctx);
+	emit(rv_jalr(rd, RV_REG_T1, lower), ctx);
+}
+
+static bool is_signed_bpf_cond(u8 cond)
+{
+	return cond == BPF_JSGT || cond == BPF_JSLT ||
+	       cond == BPF_JSGE || cond == BPF_JSLE;
+}
+
+static int emit_call(bool fixed, u64 addr, struct rv_jit_context *ctx)
+{
+	s64 off = 0;
+	u64 ip;
+	u8 rd;
+
+	if (addr && ctx->insns) {
+		ip = (u64)(long)(ctx->insns + ctx->ninsns);
+		off = addr - ip;
+		if (!is_32b_int(off)) {
+			pr_err("bpf-jit: target call addr %pK is out of range\n",
+			       (void *)addr);
+			return -ERANGE;
+		}
+	}
+
+	emit_jump_and_link(RV_REG_RA, off, !fixed, ctx);
+	rd = bpf_to_rv_reg(BPF_REG_0, ctx);
+	emit(rv_addi(rd, RV_REG_A0, 0), ctx);
+	return 0;
+}
+
 static int emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 		     bool extra_pass)
 {
 	bool is64 = BPF_CLASS(insn->code) == BPF_ALU64 ||
 		    BPF_CLASS(insn->code) == BPF_JMP;
+	int s, e, rvoff, i = insn - ctx->prog->insnsi;
 	struct bpf_prog_aux *aux = ctx->prog->aux;
-	int rvoff, i = insn - ctx->prog->insnsi;
 	u8 rd = -1, rs = -1, code = insn->code;
 	s16 off = insn->off;
 	s32 imm = insn->imm;
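The emit_call() helper added above only emits a call when the displacement between the call site and the target fits in 32 bits, since auipc+jalr spans roughly +/-2 GB — the same constraint the new BPF_JIT_REGION addresses. A toy illustration with invented addresses:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors is_32b_int()/emit_call()'s reachability test. */
static bool is_32b_int(int64_t val)
{
	return -(1LL << 31) <= val && val < (1LL << 31);
}

static bool call_in_range(uint64_t target, uint64_t ip)
{
	return is_32b_int((int64_t)(target - ip));
}

int main(void)
{
	uint64_t ip = 0xffffffdf00000000ULL; /* made-up JIT image address */

	printf("near: %d\n", call_in_range(ip + 0x10000, ip));      /* 1 */
	printf("far:  %d\n", call_in_range(ip + (3ULL << 30), ip)); /* 0 */
	return 0;
}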
@@ -1000,214 +1129,110 @@ out_be:
 
 	/* JUMP off */
 	case BPF_JMP | BPF_JA:
-		rvoff = rv_offset(i + off, i, ctx);
-		if (!is_21b_int(rvoff)) {
-			pr_err("bpf-jit: insn=%d offset=%d not supported yet!\n",
-			       i, rvoff);
-			return -1;
-		}
-
-		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		rvoff = rv_offset(i, off, ctx);
+		emit_jump_and_link(RV_REG_ZERO, rvoff, false, ctx);
 		break;
 
 	/* IF (dst COND src) JUMP off */
 	case BPF_JMP | BPF_JEQ | BPF_X:
 	case BPF_JMP32 | BPF_JEQ | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_zext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_beq(rd, rs, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JGT | BPF_X:
 	case BPF_JMP32 | BPF_JGT | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_zext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_bltu(rs, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JLT | BPF_X:
 	case BPF_JMP32 | BPF_JLT | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_zext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_bltu(rd, rs, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JGE | BPF_X:
 	case BPF_JMP32 | BPF_JGE | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_zext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_bgeu(rd, rs, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JLE | BPF_X:
 	case BPF_JMP32 | BPF_JLE | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_zext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_bgeu(rs, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JNE | BPF_X:
 	case BPF_JMP32 | BPF_JNE | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_zext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_bne(rd, rs, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSGT | BPF_X:
 	case BPF_JMP32 | BPF_JSGT | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_sext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_blt(rs, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSLT | BPF_X:
 	case BPF_JMP32 | BPF_JSLT | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_sext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_blt(rd, rs, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSGE | BPF_X:
 	case BPF_JMP32 | BPF_JSGE | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_sext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_bge(rd, rs, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSLE | BPF_X:
 	case BPF_JMP32 | BPF_JSLE | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_sext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_bge(rs, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSET | BPF_X:
 	case BPF_JMP32 | BPF_JSET | BPF_X:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		if (!is64)
-			emit_zext_32_rd_rs(&rd, &rs, ctx);
-		emit(rv_and(RV_REG_T1, rd, rs), ctx);
-		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		rvoff = rv_offset(i, off, ctx);
+		if (!is64) {
+			s = ctx->ninsns;
+			if (is_signed_bpf_cond(BPF_OP(code)))
+				emit_sext_32_rd_rs(&rd, &rs, ctx);
+			else
+				emit_zext_32_rd_rs(&rd, &rs, ctx);
+			e = ctx->ninsns;
+
+			/* Adjust for extra insns */
+			rvoff -= (e - s) << 2;
+		}
+
+		if (BPF_OP(code) == BPF_JSET) {
+			/* Adjust for and */
+			rvoff -= 4;
+			emit(rv_and(RV_REG_T1, rd, rs), ctx);
+			emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff,
+				    ctx);
+		} else {
+			emit_branch(BPF_OP(code), rd, rs, rvoff, ctx);
+		}
 		break;
 
 	/* IF (dst COND imm) JUMP off */
 	case BPF_JMP | BPF_JEQ | BPF_K:
 	case BPF_JMP32 | BPF_JEQ | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_zext_32_rd_t1(&rd, ctx);
-		emit(rv_beq(rd, RV_REG_T1, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JGT | BPF_K:
 	case BPF_JMP32 | BPF_JGT | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_zext_32_rd_t1(&rd, ctx);
-		emit(rv_bltu(RV_REG_T1, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JLT | BPF_K:
 	case BPF_JMP32 | BPF_JLT | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_zext_32_rd_t1(&rd, ctx);
-		emit(rv_bltu(rd, RV_REG_T1, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JGE | BPF_K:
 	case BPF_JMP32 | BPF_JGE | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_zext_32_rd_t1(&rd, ctx);
-		emit(rv_bgeu(rd, RV_REG_T1, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JLE | BPF_K:
 	case BPF_JMP32 | BPF_JLE | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_zext_32_rd_t1(&rd, ctx);
-		emit(rv_bgeu(RV_REG_T1, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JNE | BPF_K:
 	case BPF_JMP32 | BPF_JNE | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_zext_32_rd_t1(&rd, ctx);
-		emit(rv_bne(rd, RV_REG_T1, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSGT | BPF_K:
 	case BPF_JMP32 | BPF_JSGT | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_sext_32_rd(&rd, ctx);
-		emit(rv_blt(RV_REG_T1, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSLT | BPF_K:
 	case BPF_JMP32 | BPF_JSLT | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_sext_32_rd(&rd, ctx);
-		emit(rv_blt(rd, RV_REG_T1, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSGE | BPF_K:
 	case BPF_JMP32 | BPF_JSGE | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_sext_32_rd(&rd, ctx);
-		emit(rv_bge(rd, RV_REG_T1, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSLE | BPF_K:
 	case BPF_JMP32 | BPF_JSLE | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
-		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_sext_32_rd(&rd, ctx);
-		emit(rv_bge(RV_REG_T1, rd, rvoff >> 1), ctx);
-		break;
 	case BPF_JMP | BPF_JSET | BPF_K:
 	case BPF_JMP32 | BPF_JSET | BPF_K:
-		if (rv_offset_check(&rvoff, off, i, ctx))
-			return -1;
+		rvoff = rv_offset(i, off, ctx);
+		s = ctx->ninsns;
 		emit_imm(RV_REG_T1, imm, ctx);
-		if (!is64)
-			emit_zext_32_rd_t1(&rd, ctx);
-		emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
-		emit(rv_bne(RV_REG_T1, RV_REG_ZERO, rvoff >> 1), ctx);
+		if (!is64) {
+			if (is_signed_bpf_cond(BPF_OP(code)))
+				emit_sext_32_rd(&rd, ctx);
+			else
+				emit_zext_32_rd_t1(&rd, ctx);
+		}
+		e = ctx->ninsns;
+
+		/* Adjust for extra insns */
+		rvoff -= (e - s) << 2;
+
+		if (BPF_OP(code) == BPF_JSET) {
+			/* Adjust for and */
+			rvoff -= 4;
+			emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
+			emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff,
+				    ctx);
+		} else {
+			emit_branch(BPF_OP(code), rd, RV_REG_T1, rvoff, ctx);
+		}
 		break;
 
 	/* function call */
 	case BPF_JMP | BPF_CALL:
 	{
 		bool fixed;
-		int i, ret;
+		int ret;
 		u64 addr;
 
 		mark_call(ctx);
@@ -1215,20 +1240,9 @@ out_be:
 					    &fixed);
 		if (ret < 0)
 			return ret;
-		if (fixed) {
-			emit_imm(RV_REG_T1, addr, ctx);
-		} else {
-			i = ctx->ninsns;
-			emit_imm(RV_REG_T1, addr, ctx);
-			for (i = ctx->ninsns - i; i < 8; i++) {
-				/* nop */
-				emit(rv_addi(RV_REG_ZERO, RV_REG_ZERO, 0),
-				     ctx);
-			}
-		}
-		emit(rv_jalr(RV_REG_RA, RV_REG_T1, 0), ctx);
-		rd = bpf_to_rv_reg(BPF_REG_0, ctx);
-		emit(rv_addi(rd, RV_REG_A0, 0), ctx);
+		ret = emit_call(fixed, addr, ctx);
+		if (ret)
+			return ret;
 		break;
 	}
 	/* tail call */
@@ -1243,9 +1257,7 @@ out_be:
 			break;
 
 		rvoff = epilogue_offset(ctx);
-		if (is_21b_check(rvoff, i))
-			return -1;
-		emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx);
+		emit_jump_and_link(RV_REG_ZERO, rvoff, false, ctx);
 		break;
 
 	/* dst = imm64 */
@@ -1426,6 +1438,10 @@ static void build_prologue(struct rv_jit_context *ctx)
 {
 	int stack_adjust = 0, store_offset, bpf_stack_adjust;
 
+	bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16);
+	if (bpf_stack_adjust)
+		mark_fp(ctx);
+
 	if (seen_reg(RV_REG_RA, ctx))
 		stack_adjust += 8;
 	stack_adjust += 8; /* RV_REG_FP */
@@ -1443,7 +1459,6 @@ static void build_prologue(struct rv_jit_context *ctx)
 		stack_adjust += 8;
 
 	stack_adjust = round_up(stack_adjust, 16);
-	bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16);
 	stack_adjust += bpf_stack_adjust;
 
 	store_offset = stack_adjust - 8;
@@ -1502,10 +1517,10 @@ static void build_prologue(struct rv_jit_context *ctx)
 
 static void build_epilogue(struct rv_jit_context *ctx)
 {
-	__build_epilogue(RV_REG_RA, ctx);
+	__build_epilogue(false, ctx);
 }
 
-static int build_body(struct rv_jit_context *ctx, bool extra_pass)
+static int build_body(struct rv_jit_context *ctx, bool extra_pass, int *offset)
 {
 	const struct bpf_prog *prog = ctx->prog;
 	int i;
@@ -1517,12 +1532,12 @@ static int build_body(struct rv_jit_context *ctx, bool extra_pass)
 		ret = emit_insn(insn, ctx, extra_pass);
 		if (ret > 0) {
 			i++;
-			if (ctx->insns == NULL)
-				ctx->offset[i] = ctx->ninsns;
+			if (offset)
+				offset[i] = ctx->ninsns;
 			continue;
 		}
-		if (ctx->insns == NULL)
-			ctx->offset[i] = ctx->ninsns;
+		if (offset)
+			offset[i] = ctx->ninsns;
 		if (ret)
 			return ret;
 	}
@@ -1548,9 +1563,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	bool tmp_blinded = false, extra_pass = false;
 	struct bpf_prog *tmp, *orig_prog = prog;
+	int pass = 0, prev_ninsns = 0, i;
 	struct rv_jit_data *jit_data;
+	unsigned int image_size = 0;
 	struct rv_jit_context *ctx;
-	unsigned int image_size;
 
 	if (!prog->jit_requested)
 		return orig_prog;
@@ -1587,33 +1603,59 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		prog = orig_prog;
 		goto out_offset;
 	}
+	for (i = 0; i < prog->len; i++) {
+		prev_ninsns += 32;
+		ctx->offset[i] = prev_ninsns;
+	}
 
-	/* First pass generates the ctx->offset, but does not emit an image. */
-	if (build_body(ctx, extra_pass)) {
-		prog = orig_prog;
-		goto out_offset;
+	for (i = 0; i < 16; i++) {
+		pass++;
+		ctx->ninsns = 0;
+		if (build_body(ctx, extra_pass, ctx->offset)) {
+			prog = orig_prog;
+			goto out_offset;
+		}
+		build_prologue(ctx);
+		ctx->epilogue_offset = ctx->ninsns;
+		build_epilogue(ctx);
+
+		if (ctx->ninsns == prev_ninsns) {
+			if (jit_data->header)
+				break;
+
+			image_size = sizeof(u32) * ctx->ninsns;
+			jit_data->header =
+				bpf_jit_binary_alloc(image_size,
+						     &jit_data->image,
+						     sizeof(u32),
+						     bpf_fill_ill_insns);
+			if (!jit_data->header) {
+				prog = orig_prog;
+				goto out_offset;
+			}
+
+			ctx->insns = (u32 *)jit_data->image;
+			/* Now, when the image is allocated, the image
+			 * can potentially shrink more (auipc/jalr ->
+			 * jal).
+			 */
+		}
+		prev_ninsns = ctx->ninsns;
 	}
-	build_prologue(ctx);
-	ctx->epilogue_offset = ctx->ninsns;
-	build_epilogue(ctx);
 
-	/* Allocate image, now that we know the size. */
-	image_size = sizeof(u32) * ctx->ninsns;
-	jit_data->header = bpf_jit_binary_alloc(image_size, &jit_data->image,
-						sizeof(u32),
-						bpf_fill_ill_insns);
-	if (!jit_data->header) {
+	if (i == 16) {
+		pr_err("bpf-jit: image did not converge in <%d passes!\n", i);
+		bpf_jit_binary_free(jit_data->header);
 		prog = orig_prog;
 		goto out_offset;
 	}
 
-	/* Second, real pass, that acutally emits the image. */
-	ctx->insns = (u32 *)jit_data->image;
 skip_init_ctx:
+	pass++;
 	ctx->ninsns = 0;
 
 	build_prologue(ctx);
-	if (build_body(ctx, extra_pass)) {
+	if (build_body(ctx, extra_pass, NULL)) {
 		bpf_jit_binary_free(jit_data->header);
 		prog = orig_prog;
 		goto out_offset;
@@ -1621,7 +1663,7 @@ skip_init_ctx:
 	build_epilogue(ctx);
 
 	if (bpf_jit_enable > 1)
-		bpf_jit_dump(prog->len, image_size, 2, ctx->insns);
+		bpf_jit_dump(prog->len, image_size, pass, ctx->insns);
 
 	prog->bpf_func = (void *)ctx->insns;
 	prog->jited = 1;
@@ -1641,3 +1683,16 @@ out:
 		  tmp : orig_prog);
 	return prog;
 }
+
+void *bpf_jit_alloc_exec(unsigned long size)
+{
+	return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
+				    BPF_JIT_REGION_END, GFP_KERNEL,
+				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
+
+void bpf_jit_free_exec(void *addr)
+{
+	return vfree(addr);
+}
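The compile loop above is classic fixed-point relaxation: seed with a pessimistic estimate of 32 RV instructions per BPF insn, re-JIT until the image stops shrinking (capped at 16 passes), allocate the image, then do one final emission pass. A schematic of that control flow, with the code generator stubbed out by an invented shrink function:

#include <stdio.h>

#define MAX_PASSES 16

/* Stub: pretend each pass tightens some branch encodings. A real pass
 * would re-run the code generator with the offsets learned previously.
 * This toy version converges to 100 instructions. */
static int emit_pass(int prev_ninsns)
{
	int shrunk = prev_ninsns - prev_ninsns / 8;

	return shrunk > 100 ? shrunk : 100;
}

int main(void)
{
	int prev_ninsns = 32 * 8; /* pessimistic seed: 8 BPF insns */
	int ninsns, i;

	for (i = 0; i < MAX_PASSES; i++) {
		ninsns = emit_pass(prev_ninsns);
		if (ninsns == prev_ninsns) {
			printf("converged to %d insns after %d passes\n",
			       ninsns, i + 1);
			return 0;
		}
		prev_ninsns = ninsns;
	}
	printf("did not converge in %d passes\n", MAX_PASSES);
	return 1;
}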
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e8949953660..1f6a0388a65f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@ config X86
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	select ARCH_WANT_DEFAULT_BPF_JIT	if X86_64
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_HUGE_PMD_SHARE
 	select ARCH_WANTS_THP_SWAP		if X86_64
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b8be18427277..4c8a2d1f8470 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -10,10 +10,12 @@
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
 #include <linux/memory.h>
+#include <linux/sort.h>
 #include <asm/extable.h>
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
+#include <asm/asm-prototypes.h>
 
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -1530,6 +1532,154 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
 	return 0;
 }
 
+static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	s64 offset;
+
+	offset = func - (ip + 2 + 4);
+	if (!is_simm32(offset)) {
+		pr_err("Target %p is out of range\n", func);
+		return -EINVAL;
+	}
+	EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
+	*pprog = prog;
+	return 0;
+}
+
+static void emit_nops(u8 **pprog, unsigned int len)
+{
+	unsigned int i, noplen;
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	while (len > 0) {
+		noplen = len;
+
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+
+		for (i = 0; i < noplen; i++)
+			EMIT1(ideal_nops[noplen][i]);
+		len -= noplen;
+	}
+
+	*pprog = prog;
+}
+
+static int emit_fallback_jump(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int err = 0;
+
+#ifdef CONFIG_RETPOLINE
+	/* Note that this assumes the the compiler uses external
+	 * thunks for indirect calls. Both clang and GCC use the same
+	 * naming convention for external thunks.
+	 */
+	err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog);
+#else
+	int cnt = 0;
+
+	EMIT2(0xFF, 0xE2);	/* jmp rdx */
+#endif
+	*pprog = prog;
+	return err;
+}
+
+static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
+{
+	u8 *jg_reloc, *jg_target, *prog = *pprog;
+	int pivot, err, jg_bytes = 1, cnt = 0;
+	s64 jg_offset;
+
+	if (a == b) {
+		/* Leaf node of recursion, i.e. not a range of indices
+		 * anymore.
+		 */
+		EMIT1(add_1mod(0x48, BPF_REG_3));	/* cmp rdx,func */
+		if (!is_simm32(progs[a]))
+			return -1;
+		EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
+			    progs[a]);
+		err = emit_cond_near_jump(&prog,	/* je func */
+					  (void *)progs[a], prog,
+					  X86_JE);
+		if (err)
+			return err;
+
+		err = emit_fallback_jump(&prog);	/* jmp thunk/indirect */
+		if (err)
+			return err;
+
+		*pprog = prog;
+		return 0;
+	}
+
+	/* Not a leaf node, so we pivot, and recursively descend into
+	 * the lower and upper ranges.
+	 */
+	pivot = (b - a) / 2;
+	EMIT1(add_1mod(0x48, BPF_REG_3));		/* cmp rdx,func */
+	if (!is_simm32(progs[a + pivot]))
+		return -1;
+	EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);
+
+	if (pivot > 2) {				/* jg upper_part */
+		/* Require near jump. */
+		jg_bytes = 4;
+		EMIT2_off32(0x0F, X86_JG + 0x10, 0);
+	} else {
+		EMIT2(X86_JG, 0);
+	}
+	jg_reloc = prog;
+
+	err = emit_bpf_dispatcher(&prog, a, a + pivot,	/* emit lower_part */
+				  progs);
+	if (err)
+		return err;
+
+	/* From Intel 64 and IA-32 Architectures Optimization
+	 * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
+	 * Coding Rule 11: All branch targets should be 16-byte
+	 * aligned.
+	 */
+	jg_target = PTR_ALIGN(prog, 16);
+	if (jg_target != prog)
+		emit_nops(&prog, jg_target - prog);
+	jg_offset = prog - jg_reloc;
+	emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);
+
+	err = emit_bpf_dispatcher(&prog, a + pivot + 1,	/* emit upper_part */
+				  b, progs);
+	if (err)
+		return err;
+
+	*pprog = prog;
+	return 0;
+}
+
+static int cmp_ips(const void *a, const void *b)
+{
+	const s64 *ipa = a;
+	const s64 *ipb = b;
+
+	if (*ipa > *ipb)
+		return 1;
+	if (*ipa < *ipb)
+		return -1;
+	return 0;
+}
+
+int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+	u8 *prog = image;
+
+	sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
+	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs);
+}
+
 struct x64_jit_data {
 	struct bpf_binary_header *header;
 	int *addrs;
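emit_bpf_dispatcher() above JITs a binary search over the sorted program addresses, with a direct `je func` at each leaf and a retpoline-safe indirect jump as the fallback. The generated code is equivalent in shape to this C search (addresses are toy values):

#include <stdio.h>

typedef long long s64;

/* C rendition of the dispatch logic the x86 JIT emits: binary-search
 * the sorted table, "direct jump" on a hit, indirect fallback otherwise. */
static const char *dispatch(const s64 *progs, int a, int b, s64 target)
{
	int pivot;

	if (a == b)
		return progs[a] == target ? "direct jump" : "indirect fallback";

	pivot = a + (b - a) / 2;
	if (target <= progs[pivot])
		return dispatch(progs, a, pivot, target);
	return dispatch(progs, pivot + 1, b, target);
}

int main(void)
{
	s64 progs[] = { 0x1000, 0x2000, 0x3000, 0x4000 }; /* sorted, toy */

	printf("%s\n", dispatch(progs, 0, 3, 0x3000)); /* direct jump */
	printf("%s\n", dispatch(progs, 0, 3, 0x2500)); /* indirect fallback */
	return 0;
}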
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index f73cd917c44f..42058fad6a3c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -269,7 +269,7 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
 
 	bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
-	xsk_umem_discard_addr(umem);
+	xsk_umem_release_addr(umem);
 	return true;
 }
 
@@ -306,7 +306,7 @@ static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
 
 	bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
-	xsk_umem_discard_addr_rq(umem);
+	xsk_umem_release_addr_rq(umem);
 	return true;
 }
 
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index cf9b8b22d24f..0af1f0b951c0 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -555,7 +555,7 @@ ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
 
 	rx_buf->handle = handle + umem->headroom;
 
-	xsk_umem_discard_addr(umem);
+	xsk_umem_release_addr(umem);
 	return true;
 }
 
@@ -591,7 +591,7 @@ ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
 
 	rx_buf->handle = handle + umem->headroom;
 
-	xsk_umem_discard_addr_rq(umem);
+	xsk_umem_release_addr_rq(umem);
 	return true;
 }
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index b43be9f14105..74b540ebb3dc 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -277,7 +277,7 @@ static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
 
 	bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
-	xsk_umem_discard_addr(umem);
+	xsk_umem_release_addr(umem);
 	return true;
 }
 
@@ -304,7 +304,7 @@ static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
 
 	bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
-	xsk_umem_discard_addr_rq(umem);
+	xsk_umem_release_addr_rq(umem);
 	return true;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 475b6bd5d29b..62fc8a128a8d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -35,7 +35,7 @@ int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
 	 */
 	dma_info->addr = xdp_umem_get_dma(umem, handle);
 
-	xsk_umem_discard_addr_rq(umem);
+	xsk_umem_release_addr_rq(umem);
 
 	dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
 				   DMA_BIDIRECTIONAL);
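The rename from xsk_umem_discard_addr() to xsk_umem_release_addr() is mechanical across these drivers, but the calling pattern it belongs to is worth spelling out: peek the next fill-queue address, use it to build an RX buffer, then release the queue slot. A hedged sketch of that pattern with stand-in types — this is not the real driver or umem code:

#include <stdbool.h>
#include <stdint.h>

/* Stand-ins for the real umem/fill-queue API; only the peek/use/release
 * calling pattern mirrors the drivers touched by this patch. */
struct umem { uint64_t fq[64]; unsigned int head, tail; };

static bool umem_peek_addr(struct umem *u, uint64_t *addr)
{
	if (u->head == u->tail)
		return false; /* fill queue empty */
	*addr = u->fq[u->head % 64];
	return true;
}

static void umem_release_addr(struct umem *u)
{
	u->head++; /* consume the slot we peeked */
}

static bool alloc_rx_buffer(struct umem *u, uint64_t *handle)
{
	uint64_t addr;

	if (!umem_peek_addr(u, &addr))
		return false;
	*handle = addr; /* a real driver would DMA-map addr here */
	umem_release_addr(u);
	return true;
}

int main(void)
{
	struct umem u = { .fq = { 0x1000, 0x2000 }, .head = 0, .tail = 2 };
	uint64_t handle;

	while (alloc_rx_buffer(&u, &handle))
		; /* drain both fill-queue entries */
	return 0;
}

Note also the related prototype change in include/net/xdp_sock.h below: xsk_umem_peek_addr() now returns bool rather than a u64 pointer.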
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 9be71c195d74..a11d5b7dbbf3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -85,6 +85,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp);
 void cgroup_bpf_offline(struct cgroup *cgrp);
 
 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			struct bpf_prog *replace_prog,
 			enum bpf_attach_type type, u32 flags);
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type);
@@ -93,7 +94,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 
 /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
 int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, u32 flags);
+		      struct bpf_prog *replace_prog, enum bpf_attach_type type,
+		      u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
 int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 085a59afba85..b14e51d56a82 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -471,11 +471,69 @@ struct bpf_trampoline {
 	void *image;
 	u64 selector;
 };
+
+#define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */
+
+struct bpf_dispatcher_prog {
+	struct bpf_prog *prog;
+	refcount_t users;
+};
+
+struct bpf_dispatcher {
+	/* dispatcher mutex */
+	struct mutex mutex;
+	void *func;
+	struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX];
+	int num_progs;
+	void *image;
+	u32 image_off;
+};
+
+static __always_inline unsigned int bpf_dispatcher_nopfunc(
+	const void *ctx,
+	const struct bpf_insn *insnsi,
+	unsigned int (*bpf_func)(const void *,
+				 const struct bpf_insn *))
+{
+	return bpf_func(ctx, insnsi);
+}
 #ifdef CONFIG_BPF_JIT
 struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
 int bpf_trampoline_link_prog(struct bpf_prog *prog);
 int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
 void bpf_trampoline_put(struct bpf_trampoline *tr);
+void *bpf_jit_alloc_exec_page(void);
+#define BPF_DISPATCHER_INIT(name) {			\
+	.mutex = __MUTEX_INITIALIZER(name.mutex),	\
+	.func = &name##func,				\
+	.progs = {},					\
+	.num_progs = 0,					\
+	.image = NULL,					\
+	.image_off = 0					\
+}
+
+#define DEFINE_BPF_DISPATCHER(name)					\
+	noinline unsigned int name##func(				\
+		const void *ctx,					\
+		const struct bpf_insn *insnsi,				\
+		unsigned int (*bpf_func)(const void *,			\
+					 const struct bpf_insn *))	\
+	{								\
+		return bpf_func(ctx, insnsi);				\
+	}								\
+	EXPORT_SYMBOL(name##func);					\
+	struct bpf_dispatcher name = BPF_DISPATCHER_INIT(name);
+#define DECLARE_BPF_DISPATCHER(name)					\
+	unsigned int name##func(					\
+		const void *ctx,					\
+		const struct bpf_insn *insnsi,				\
+		unsigned int (*bpf_func)(const void *,			\
+					 const struct bpf_insn *));	\
+	extern struct bpf_dispatcher name;
+#define BPF_DISPATCHER_FUNC(name) name##func
+#define BPF_DISPATCHER_PTR(name) (&name)
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+				struct bpf_prog *to);
 #else
 static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 {
@@ -490,6 +548,13 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
 	return -ENOTSUPP;
 }
 static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
+#define DEFINE_BPF_DISPATCHER(name)
+#define DECLARE_BPF_DISPATCHER(name)
+#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nopfunc
+#define BPF_DISPATCHER_PTR(name) NULL
+static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d,
+					      struct bpf_prog *from,
+					      struct bpf_prog *to) {}
 #endif
 
 struct bpf_func_info_aux {
@@ -897,14 +962,14 @@ struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
-void __dev_map_flush(struct bpf_map *map);
+void __dev_map_flush(void);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
-void __cpu_map_flush(struct bpf_map *map);
+void __cpu_map_flush(void);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 
@@ -943,6 +1008,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 
 int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog);
 
+struct bpf_prog *bpf_prog_by_id(u32 id);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -1004,7 +1071,7 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
-static inline void __dev_map_flush(struct bpf_map *map)
+static inline void __dev_map_flush(void)
 {
 }
 
@@ -1033,7 +1100,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 	return NULL;
 }
 
-static inline void __cpu_map_flush(struct bpf_map *map)
+static inline void __cpu_map_flush(void)
 {
 }
 
@@ -1074,6 +1141,11 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
+
+static inline struct bpf_prog *bpf_prog_by_id(u32 id)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
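DEFINE_BPF_DISPATCHER(name) expands to a patchable trampoline named name##func plus a struct bpf_dispatcher instance; DECLARE_BPF_DISPATCHER gives other translation units the extern declarations. A rough user-space expansion of what the macros produce, with BPF types reduced to minimal stand-ins for illustration:

#include <stdio.h>

struct bpf_insn { int dummy; };

typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *);

/* Roughly what DEFINE_BPF_DISPATCHER(foo) expands to: a trampoline whose
 * default body simply forwards to bpf_func. The JIT can later rewrite the
 * trampoline into a direct-call binary search over attached programs. */
unsigned int foofunc(const void *ctx, const struct bpf_insn *insnsi,
		     bpf_func_t bpf_func)
{
	return bpf_func(ctx, insnsi);
}

struct bpf_dispatcher { void *func; int num_progs; };
struct bpf_dispatcher foo = { .func = &foofunc, .num_progs = 0 };

static unsigned int my_prog(const void *ctx, const struct bpf_insn *insnsi)
{
	(void)ctx; (void)insnsi;
	return 42;
}

int main(void)
{
	/* BPF_DISPATCHER_FUNC(foo) is just foofunc. */
	printf("%u\n", foofunc(NULL, NULL, my_prog));
	return 0;
}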
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 345f3748e0fb..70e6dd960bca 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -559,23 +559,26 @@ struct sk_filter {
 
 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
-#define BPF_PROG_RUN(prog, ctx)	({				\
-	u32 ret;						\
-	cant_sleep();						\
-	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
-		struct bpf_prog_stats *stats;			\
-		u64 start = sched_clock();			\
-		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
-		stats = this_cpu_ptr(prog->aux->stats);		\
-		u64_stats_update_begin(&stats->syncp);		\
-		stats->cnt++;					\
-		stats->nsecs += sched_clock() - start;		\
-		u64_stats_update_end(&stats->syncp);		\
-	} else {						\
-		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
-	}							\
+#define __BPF_PROG_RUN(prog, ctx, dfunc)	({			\
+	u32 ret;							\
+	cant_sleep();							\
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {		\
+		struct bpf_prog_stats *stats;				\
+		u64 start = sched_clock();				\
+		ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);	\
+		stats = this_cpu_ptr(prog->aux->stats);			\
+		u64_stats_update_begin(&stats->syncp);			\
+		stats->cnt++;						\
+		stats->nsecs += sched_clock() - start;			\
+		u64_stats_update_end(&stats->syncp);			\
+	} else {							\
+		ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);	\
+	}								\
 	ret; })
 
+#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx,		\
+					       bpf_dispatcher_nopfunc)
+
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
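__BPF_PROG_RUN now routes the actual invocation through a dispatcher function `dfunc` instead of calling prog->bpf_func directly; the plain BPF_PROG_RUN wrapper passes bpf_dispatcher_nopfunc, which just forwards, so only callers that opt into a real dispatcher change behavior. A condensed C view of that indirection (stats branch omitted, names shortened):

struct bpf_insn;

typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *);
typedef unsigned int (*dispatcher_fn)(const void *, const struct bpf_insn *,
				      bpf_func_t);

/* Default dispatcher: no dispatch, just call the program. */
static unsigned int nopfunc(const void *ctx, const struct bpf_insn *insnsi,
			    bpf_func_t bpf_func)
{
	return bpf_func(ctx, insnsi);
}

/* __BPF_PROG_RUN(prog, ctx, dfunc) ~ dfunc(ctx, insnsi, bpf_func) */
static unsigned int prog_run(const void *ctx, const struct bpf_insn *insnsi,
			     bpf_func_t bpf_func, dispatcher_fn dfunc)
{
	return dfunc(ctx, insnsi, bpf_func);
}

static unsigned int dummy(const void *ctx, const struct bpf_insn *insnsi)
{
	(void)ctx; (void)insnsi;
	return 1;
}

int main(void)
{
	return prog_run(0, 0, dummy, nopfunc) == 1 ? 0 : 1;
}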
@@ -589,7 +592,6 @@ struct bpf_redirect_info {
 	u32 tgt_index;
 	void *tgt_value;
 	struct bpf_map *map;
-	struct bpf_map *map_to_flush;
 	u32 kern_flags;
 };
 
@@ -699,6 +701,8 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 	return res;
 }
 
+DECLARE_BPF_DISPATCHER(bpf_dispatcher_xdp)
+
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 					    struct xdp_buff *xdp)
 {
@@ -708,9 +712,12 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	 * already takes rcu_read_lock() when fetching the program, so
 	 * it's not necessary here anymore.
 	 */
-	return BPF_PROG_RUN(prog, xdp);
+	return __BPF_PROG_RUN(prog, xdp,
+			      BPF_DISPATCHER_FUNC(bpf_dispatcher_xdp));
 }
 
+void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);
+
 static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
 {
 	return prog->len * sizeof(struct bpf_insn);
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index e3780e4b74e1..e86ec48ef627 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -72,7 +72,6 @@ struct xdp_umem {
 
 struct xsk_map {
 	struct bpf_map map;
-	struct list_head __percpu *flush_list;
 	spinlock_t lock; /* Synchronize map updates */
 	struct xdp_sock *xsk_map[];
 };
@@ -119,8 +118,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
 /* Used from netdev driver */
 bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
-u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
-void xsk_umem_discard_addr(struct xdp_umem *umem);
+bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
+void xsk_umem_release_addr(struct xdp_umem *umem);
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
 bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
 void xsk_umem_consume_tx_done(struct xdp_umem *umem);
@@ -139,9 +138,8 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
 			     struct xdp_sock **map_entry);
 int xsk_map_inc(struct xsk_map *map);
 void xsk_map_put(struct xsk_map *map);
-int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
-		       struct xdp_sock *xs);
-void __xsk_map_flush(struct bpf_map *map);
+int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
+void __xsk_map_flush(void);
 
 static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
 						     u32 key)
@@ -199,7 +197,7 @@ static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
 	return xsk_umem_has_addrs(umem, cnt - rq->length);
 }
 
-static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
+static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
 {
 	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
 
@@ -210,12 +208,12 @@ static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
 	return addr;
 }
 
-static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem)
+static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
 {
 	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
 
 	if (!rq->length)
-		xsk_umem_discard_addr(umem);
+		xsk_umem_release_addr(umem);
 	else
 		rq->length--;
 }
@@ -260,7 +258,7 @@ static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
 	return NULL;
 }
 
-static inline void xsk_umem_discard_addr(struct xdp_umem *umem)
+static inline void xsk_umem_release_addr(struct xdp_umem *umem)
 {
 }
 
@@ -334,7 +332,7 @@ static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
 	return NULL;
 }
 
-static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem)
+static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
 {
 }
 
@@ -369,13 +367,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
 	return 0;
 }
 
-static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
-				     struct xdp_sock *xs)
+static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline void __xsk_map_flush(struct bpf_map *map)
+static inline void __xsk_map_flush(void)
 {
 }
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 3ad935527177..a534d71e689a 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -116,6 +116,7 @@
 #define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
 #define AUDIT_TIME_INJOFFSET	1332	/* Timekeeping offset injected */
 #define AUDIT_TIME_ADJNTPVAL	1333	/* NTP value adjustment */
+#define AUDIT_BPF		1334	/* BPF subsystem */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dbbcf0b02970..7df436da542d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -231,6 +231,11 @@ enum bpf_attach_type {
  * When children program makes decision (like picking TCP CA or sock bind)
  * parent program has a chance to override it.
  *
+ * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of
+ * programs for a cgroup. Though it's possible to replace an old program at
+ * any position by also specifying BPF_F_REPLACE flag and position itself in
+ * replace_bpf_fd attribute. Old program at this position will be released.
+ *
  * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
  * A cgroup with NONE doesn't allow any programs in sub-cgroups.
  * Ex1:
@@ -249,6 +254,7 @@ enum bpf_attach_type {
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
 #define BPF_F_ALLOW_MULTI	(1U << 1)
+#define BPF_F_REPLACE		(1U << 2)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
@@ -442,6 +448,10 @@ union bpf_attr {
 		__u32		attach_bpf_fd;	/* eBPF program to attach */
 		__u32		attach_type;
 		__u32		attach_flags;
+		__u32		replace_bpf_fd;	/* previously attached eBPF
+						 * program to replace if
+						 * BPF_F_REPLACE is used
+						 */
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index c02dec97e1ce..1a2898c482ee 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -142,7 +142,8 @@ struct btf_param {
 
 enum {
 	BTF_VAR_STATIC = 0,
-	BTF_VAR_GLOBAL_ALLOCATED,
+	BTF_VAR_GLOBAL_ALLOCATED = 1,
+	BTF_VAR_GLOBAL_EXTERN = 2,
 };
 
 /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe
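From user space, the BPF_F_REPLACE flag added to the uapi above is used together with BPF_F_ALLOW_MULTI: replace_bpf_fd names the already-attached program to swap out at its current list position. A hedged raw-syscall sketch, assuming uapi headers that include this patch; fds and error handling are the caller's problem:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Atomically replace old_prog_fd with new_prog_fd at its current
 * position in the cgroup's multi-prog list, instead of appending. */
static int attach_replace(int cgroup_fd, int new_prog_fd, int old_prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;
	attr.attach_bpf_fd = new_prog_fd;
	attr.attach_type = BPF_CGROUP_INET_EGRESS;
	attr.attach_flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
	attr.replace_bpf_fd = old_prog_fd;

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}

int main(void)
{
	(void)attach_replace; /* illustration only; supply real fds to use */
	return 0;
}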
diff --git a/init/Kconfig b/init/Kconfig
index a34064a031a5..890aaa62efde 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1604,6 +1604,9 @@ config BPF_SYSCALL
 	  Enable the bpf() system call that allows to manipulate eBPF
 	  programs and maps via file descriptors.
 
+config ARCH_WANT_DEFAULT_BPF_JIT
+	bool
+
 config BPF_JIT_ALWAYS_ON
 	bool "Permanently enable BPF JIT and remove BPF interpreter"
 	depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
@@ -1611,6 +1614,10 @@ config BPF_JIT_ALWAYS_ON
 	  Enables BPF JIT and removes BPF interpreter to avoid
 	  speculative execution of BPF instructions by the interpreter
 
+config BPF_JIT_DEFAULT_ON
+	def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
+	depends on HAVE_EBPF_JIT && BPF_JIT
+
 config USERFAULTFD
 	bool "Enable userfaultfd() system call"
 	depends on MMU
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3f671bf617e8..d4f330351f87 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4fb20ab179fe..ee871d4a51f7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -103,8 +103,7 @@ static u32 prog_list_length(struct list_head *head)
  * if parent has overridable or multi-prog, allow attaching
  */
 static bool hierarchy_allows_attach(struct cgroup *cgrp,
-				    enum bpf_attach_type type,
-				    u32 new_flags)
+				    enum bpf_attach_type type)
 {
 	struct cgroup *p;
 
@@ -283,31 +282,34 @@ cleanup:
  * propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
  * @prog: A program to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
  * @type: Type of attach operation
  * @flags: Option flags
  *
  * Must be called with cgroup_mutex held.
  */
 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			struct bpf_prog *replace_prog,
 			enum bpf_attach_type type, u32 flags)
 {
+	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
 		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+	struct bpf_prog_list *pl, *replace_pl = NULL;
 	enum bpf_cgroup_storage_type stype;
-	struct bpf_prog_list *pl;
-	bool pl_was_allocated;
 	int err;
 
-	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
 		/* invalid combination */
 		return -EINVAL;
 
-	if (!hierarchy_allows_attach(cgrp, type, flags))
+	if (!hierarchy_allows_attach(cgrp, type))
 		return -EPERM;
 
-	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
 		/* Disallow attaching non-overridable on top
 		 * of existing overridable in this cgroup.
 		 * Disallow attaching multi-prog if overridable or none
@@ -317,6 +319,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 		return -E2BIG;
 
+	if (flags & BPF_F_ALLOW_MULTI) {
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog == prog)
+				/* disallow attaching the same prog twice */
+				return -EINVAL;
+			if (pl->prog == replace_prog)
+				replace_pl = pl;
+		}
+		if ((flags & BPF_F_REPLACE) && !replace_pl)
+			/* prog to replace not found for cgroup */
+			return -ENOENT;
+	} else if (!list_empty(progs)) {
+		replace_pl = list_first_entry(progs, typeof(*pl), node);
+	}
+
 	for_each_cgroup_storage_type(stype) {
 		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
 		if (IS_ERR(storage[stype])) {
@@ -327,53 +344,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		}
 	}
 
-	if (flags & BPF_F_ALLOW_MULTI) {
-		list_for_each_entry(pl, progs, node) {
-			if (pl->prog == prog) {
-				/* disallow attaching the same prog twice */
-				for_each_cgroup_storage_type(stype)
-					bpf_cgroup_storage_free(storage[stype]);
-				return -EINVAL;
-			}
+	if (replace_pl) {
+		pl = replace_pl;
+		old_prog = pl->prog;
+		for_each_cgroup_storage_type(stype) {
+			old_storage[stype] = pl->storage[stype];
+			bpf_cgroup_storage_unlink(old_storage[stype]);
 		}
-
+	} else {
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
 			for_each_cgroup_storage_type(stype)
 				bpf_cgroup_storage_free(storage[stype]);
 			return -ENOMEM;
 		}
-
-		pl_was_allocated = true;
-		pl->prog = prog;
-		for_each_cgroup_storage_type(stype)
-			pl->storage[stype] = storage[stype];
 		list_add_tail(&pl->node, progs);
-	} else {
-		if (list_empty(progs)) {
-			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
-			if (!pl) {
-				for_each_cgroup_storage_type(stype)
-					bpf_cgroup_storage_free(storage[stype]);
-				return -ENOMEM;
-			}
-			pl_was_allocated = true;
-			list_add_tail(&pl->node, progs);
-		} else {
-			pl = list_first_entry(progs, typeof(*pl), node);
-			old_prog = pl->prog;
-			for_each_cgroup_storage_type(stype) {
-				old_storage[stype] = pl->storage[stype];
-				bpf_cgroup_storage_unlink(old_storage[stype]);
-			}
-			pl_was_allocated = false;
-		}
-		pl->prog = prog;
-		for_each_cgroup_storage_type(stype)
-			pl->storage[stype] = storage[stype];
 	}
 
-	cgrp->bpf.flags[type] = flags;
+	pl->prog = prog;
+	for_each_cgroup_storage_type(stype)
+		pl->storage[stype] = storage[stype];
+
+	cgrp->bpf.flags[type] = saved_flags;
 
 	err = update_effective_progs(cgrp, type);
 	if (err)
@@ -401,7 +393,7 @@ cleanup:
 		pl->storage[stype] = old_storage[stype];
 		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
 	}
-	if (pl_was_allocated) {
+	if (!replace_pl) {
 		list_del(&pl->node);
 		kfree(pl);
 	}
@@ -539,6 +531,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
 {
+	struct bpf_prog *replace_prog = NULL;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -546,8 +539,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 	if (IS_ERR(cgrp))
 		return PTR_ERR(cgrp);
 
-	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
+	    (attr->attach_flags & BPF_F_REPLACE)) {
+		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
+		if (IS_ERR(replace_prog)) {
+			cgroup_put(cgrp);
+			return PTR_ERR(replace_prog);
+		}
+	}
+
+	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
 				attr->attach_flags);
+
+	if (replace_prog)
+		bpf_prog_put(replace_prog);
 	cgroup_put(cgrp);
 	return ret;
 }
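The rewritten attach path folds the old "multi vs. single" branches into one: pick replace_pl up front (the matching entry under BPF_F_REPLACE, or the head entry in non-multi mode), then either reuse that node or append a new one. The selection step, reduced to a walk over a plain C array — a hypothetical helper for illustration, not kernel code:

#include <assert.h>
#include <errno.h>
#include <stddef.h>

struct prog_list { const void *prog; };

/* Mirrors the new __cgroup_bpf_attach() selection logic. Returns the
 * slot to overwrite, or NULL to append; *err reports failures. */
static struct prog_list *pick_replace_slot(struct prog_list *pl, int n,
					   const void *prog,
					   const void *replace_prog,
					   int allow_multi, int do_replace,
					   int *err)
{
	struct prog_list *slot = NULL;
	int i;

	*err = 0;
	if (allow_multi) {
		for (i = 0; i < n; i++) {
			if (pl[i].prog == prog) {
				*err = -EINVAL; /* same prog attached twice */
				return NULL;
			}
			if (pl[i].prog == replace_prog)
				slot = &pl[i];
		}
		if (do_replace && !slot)
			*err = -ENOENT; /* prog to replace not found */
	} else if (n > 0) {
		slot = &pl[0]; /* single-prog mode: overwrite the head */
	}
	return slot;
}

int main(void)
{
	int a, b, err;
	struct prog_list pl[] = { { &a } };

	assert(pick_replace_slot(pl, 1, &b, &a, 1, 1, &err) == &pl[0] && !err);
	assert(pick_replace_slot(pl, 1, &b, &b, 1, 1, &err) == NULL &&
	       err == -ENOENT);
	return 0;
}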
a/kernel/bpf/core.c b/kernel/bpf/core.c index af6b738cf435..29d47aae0dd1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, u32 pages, delta; int ret; - BUG_ON(fp_old == NULL); - size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) @@ -520,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ -int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); +int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; -int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; static __always_inline void diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef49e17ae47c..70f71b154fa5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -72,17 +72,18 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); + +static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; - int ret, cpu; u64 cost; + int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - cmap->flush_list = alloc_percpu(struct list_head); - if (!cmap->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_percpu; + goto free_charge; return &cmap->map; -free_percpu: - free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -399,22 +390,14 @@ free_rcu: static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; - int cpu; /* This cpu_map_entry have been disconnected from map and one - * RCU graze-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); - /* Flush remaining packets in percpu bulkq */ - for_each_online_cpu(cpu) { - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); - - /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(bq, false); - } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ put_cpu_map_entry(rcpu); @@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) * percpu bulkq to queue. 
Due to caller map_delete_elem() disable * preemption, cannot call kthread_stop() to make sure queue is empty. * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU graze period before + * cpu_map_kthread_stop, which waits for an RCU grace period before * stopping kthread, emptying the queue. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, @@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - int cpu; u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, @@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); - /* To ensure all pending flush operations have completed wait for flush - * list be empty on _all_ cpus. Because the above synchronize_rcu() - * ensures the map is disconnected from the program we can assume no new - * items will be added to the list. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - /* For cpu_map the remote CPUs can still be using the entries * (struct bpf_cpu_map_entry). */ @@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU graze-period */ + /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_flush(struct bpf_map *map) +void __cpu_map_flush(void) { - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); } } + +static int __init 
cpu_map_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); + return 0; +} + +subsys_initcall(cpu_map_init); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3d3d61b5985b..da9c832fc5c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,7 +75,6 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ - struct list_head __percpu *flush_list; struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,6 +84,7 @@ struct bpf_dtab { u32 n_buckets; }; +static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -109,8 +109,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - int err, cpu; - u64 cost; + u64 cost = 0; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -125,9 +125,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); - /* make sure page count doesn't overflow */ - cost = (u64) sizeof(struct list_head) * num_possible_cpus(); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -143,17 +140,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (err) return -EINVAL; - dtab->flush_list = alloc_percpu(struct list_head); - if (!dtab->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_percpu; + goto free_charge; spin_lock_init(&dtab->index_lock); } else { @@ -161,13 +151,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_percpu; + goto free_charge; } return 0; -free_percpu: - free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); return -ENOMEM; @@ -201,7 +189,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i, cpu; + int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -221,18 +209,6 @@ static void dev_map_free(struct bpf_map *map) /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); - /* To ensure all pending flush operations have completed wait for flush - * list to empty on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new items will be added. 
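The wait-for-empty loop deleted just below is no longer needed because a flush is no longer tied to a particular map's flush list: anything placed on the per-CPU list is expected to be drained before the NAPI poll that queued it returns, through the map-agnostic flush that this series converges on (the full hunk is in the net/core/filter.c portion further down):

void xdp_do_flush_map(void)
{
	__dev_map_flush();
	__cpu_map_flush();
	__xsk_map_flush();
}

Once the synchronize_rcu() above has returned, no CPU can still be inside an XDP program that enqueues to this map, so there is nothing left to wait for.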
- */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; @@ -266,7 +242,6 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - free_percpu(dtab->flush_list); kfree(dtab); } @@ -345,8 +320,7 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, - bool in_napi_ctx) +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) { struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; @@ -384,11 +358,7 @@ error: for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; - /* RX path under NAPI protection, can return frames faster */ - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); drops++; } goto out; @@ -401,15 +371,14 @@ error: * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(struct bpf_map *map) +void __dev_map_flush(void) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) - bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + bq_xmit_all(bq, XDP_XMIT_FLUSH); rcu_read_unlock(); } @@ -436,11 +405,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0, true); + bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -509,27 +478,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) return dev ? 
&dev->ifindex : NULL;
 }
 
-static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
-{
-	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
-		struct xdp_bulk_queue *bq;
-		int cpu;
-
-		rcu_read_lock();
-		for_each_online_cpu(cpu) {
-			bq = per_cpu_ptr(dev->bulkq, cpu);
-			bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
-		}
-		rcu_read_unlock();
-	}
-}
-
 static void __dev_map_entry_free(struct rcu_head *rcu)
 {
 	struct bpf_dtab_netdev *dev;
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
-	dev_map_flush_old(dev);
 	free_percpu(dev->bulkq);
 	dev_put(dev->dev);
 	kfree(dev);
@@ -810,10 +763,15 @@ static struct notifier_block dev_map_notifier = {
 
 static int __init dev_map_init(void)
 {
+	int cpu;
+
 	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
 	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
 		     offsetof(struct _bpf_dtab_netdev, dev));
 	register_netdevice_notifier(&dev_map_notifier);
+
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu));
 	return 0;
 }
 
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
new file mode 100644
index 000000000000..204ee61a3904
--- /dev/null
+++ b/kernel/bpf/dispatcher.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be a direct call, instead of an
+ * indirect one. The dispatcher hijacks a trampoline function via the
+ * __fentry__ of the trampoline.
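Concretely, the multiway branch emitted into the dispatcher image is a compare-and-direct-call fan-out over the registered programs. A C-level sketch of what the generated code is equivalent to (illustrative only: the real image is architecture-specific machine code produced by arch_prepare_bpf_dispatcher(), and prog_a_func/prog_b_func are placeholders, not symbols in this series):

extern unsigned int prog_a_func(const void *ctx, const struct bpf_insn *insnsi);
extern unsigned int prog_b_func(const void *ctx, const struct bpf_insn *insnsi);

static unsigned int dispatcher_equivalent(const void *ctx,
					  const struct bpf_insn *insnsi,
					  unsigned int (*bpf_func)(const void *,
								   const struct bpf_insn *))
{
	if (bpf_func == prog_a_func)
		return prog_a_func(ctx, insnsi);	/* direct call */
	if (bpf_func == prog_b_func)
		return prog_b_func(ctx, insnsi);	/* direct call */
	return bpf_func(ctx, insnsi);			/* unregistered: stays indirect */
}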
The trampoline function has the + * following signature: + * + * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi, + * unsigned int (*bpf_func)(const void *, + * const struct bpf_insn *)); + */ + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog( + struct bpf_dispatcher *d, struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (prog == d->progs[i].prog) + return &d->progs[i]; + } + return NULL; +} + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_free( + struct bpf_dispatcher *d) +{ + return bpf_dispatcher_find_prog(d, NULL); +} + +static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (entry) { + refcount_inc(&entry->users); + return false; + } + + entry = bpf_dispatcher_find_free(d); + if (!entry) + return false; + + bpf_prog_inc(prog); + entry->prog = prog; + refcount_set(&entry->users, 1); + d->num_progs++; + return true; +} + +static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (!entry) + return false; + + if (refcount_dec_and_test(&entry->users)) { + entry->prog = NULL; + bpf_prog_put(prog); + d->num_progs--; + return true; + } + return false; +} + +int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +{ + return -ENOTSUPP; +} + +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +{ + s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (d->progs[i].prog) + *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; + } + return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); +} + +static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) +{ + void *old, *new; + u32 noff; + int err; + + if (!prev_num_progs) { + old = NULL; + noff = 0; + } else { + old = d->image + d->image_off; + noff = d->image_off ^ (PAGE_SIZE / 2); + } + + new = d->num_progs ? 
d->image + noff : NULL; + if (new) { + if (bpf_dispatcher_prepare(d, new)) + return; + } + + err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); + if (err || !new) + return; + + d->image_off = noff; +} + +void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, + struct bpf_prog *to) +{ + bool changed = false; + int prev_num_progs; + + if (from == to) + return; + + mutex_lock(&d->mutex); + if (!d->image) { + d->image = bpf_jit_alloc_exec_page(); + if (!d->image) + goto out; + } + + prev_num_progs = d->num_progs; + changed |= bpf_dispatcher_remove_prog(d, from); + changed |= bpf_dispatcher_add_prog(d, to); + + if (!changed) + goto out; + + bpf_dispatcher_update(d, prev_num_progs); +out: + mutex_unlock(&d->mutex); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3461ec59570..81ee8595dfee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include <linux/timekeeping.h> #include <linux/ctype.h> #include <linux/nospec.h> +#include <linux/audit.h> #include <uapi/linux/btf.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1306,6 +1307,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } +enum bpf_audit { + BPF_AUDIT_LOAD, + BPF_AUDIT_UNLOAD, + BPF_AUDIT_MAX, +}; + +static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { + [BPF_AUDIT_LOAD] = "LOAD", + [BPF_AUDIT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) +{ + struct audit_context *ctx = NULL; + struct audit_buffer *ab; + + if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) + return; + if (audit_enabled == AUDIT_OFF) + return; + if (op == BPF_AUDIT_LOAD) + ctx = audit_context(); + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + audit_log_format(ab, "prog-id=%u op=%s", + prog->aux->id, bpf_audit_str[op]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1421,6 +1452,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1830,6 +1862,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) @@ -2040,10 +2073,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd #define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) static int bpf_prog_attach(const union bpf_attr *attr) { @@ -2305,17 +2338,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id -static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; - u32 id = attr->prog_id; - int fd; - - if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); 
spin_lock_bh(&prog_idr_lock);
 	prog = idr_find(&prog_idr, id);
@@ -2324,7 +2352,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	else
 		prog = ERR_PTR(-ENOENT);
 	spin_unlock_bh(&prog_idr_lock);
+	return prog;
+}
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	u32 id = attr->prog_id;
+	int fd;
+
+	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	prog = bpf_prog_by_id(id);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 23b0d5cfd47e..505f4e4b31d2 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -14,6 +14,22 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
 /* serializes access to trampoline_table */
 static DEFINE_MUTEX(trampoline_mutex);
 
+void *bpf_jit_alloc_exec_page(void)
+{
+	void *image;
+
+	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (!image)
+		return NULL;
+
+	set_vm_flush_reset_perms(image);
+	/* Keep image as writeable. The alternative is to keep flipping ro/rw
+	 * every time a new program is attached or detached.
+	 */
+	set_memory_x((long)image, 1);
+	return image;
+}
+
 struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 {
 	struct bpf_trampoline *tr;
@@ -34,7 +50,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 		goto out;
 
 	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
-	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	image = bpf_jit_alloc_exec_page();
 	if (!image) {
 		kfree(tr);
 		tr = NULL;
@@ -48,12 +64,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	mutex_init(&tr->mutex);
 	for (i = 0; i < BPF_TRAMP_MAX; i++)
 		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
-
-	set_vm_flush_reset_perms(image);
-	/* Keep image as writeable. The alternative is to keep flipping ro/rw
-	 * everytime new program is attached or detached.
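Factoring the helper out of bpf_trampoline_lookup() lets the dispatcher share the same allocation policy: one page that is executable from the start and stays writable, so bpf_arch_text_poke() can patch it later without flipping ro/rw on every update. The dispatcher-side caller added earlier in this diff reduces to this shape (condensed from bpf_dispatcher_change_prog() above):

	if (!d->image) {
		d->image = bpf_jit_alloc_exec_page();
		if (!d->image)
			goto out;
	}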
- */ - set_memory_x((long)image, 1); tr->image = image; out: mutex_unlock(&trampoline_mutex); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 90c4fce1c981..2cc5c8f4c800 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct bpf_map_memory mem; - int cpu, err, numa_node; + int err, numa_node; struct xsk_map *m; - u64 cost, size; + u64 size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, size); if (err < 0) return ERR_PTR(err); @@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) { - bpf_map_charge_finish(&m->map.memory); - bpf_map_area_free(m); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - return &m->map; } @@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); - free_percpu(m->flush_list); bpf_map_area_free(m); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..725365df066d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6288,12 +6288,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + struct bpf_prog *replace_prog, enum bpf_attach_type type, + u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f79205d4444f..d555c0d8657d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,7 +15,7 @@ #include <trace/events/bpf_test_run.h> static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, - u32 *retval, u32 *time) + u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; enum bpf_cgroup_storage_type stype; @@ -41,7 +41,11 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { bpf_cgroup_storage_set(storage); - *retval = BPF_PROG_RUN(prog, ctx); + + if (xdp) + *retval = bpf_prog_run_xdp(prog, ctx); + else + *retval = BPF_PROG_RUN(prog, ctx); if (signal_pending(current)) { ret = -EINTR; @@ -247,34 +251,53 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return 0; /* make sure the fields we don't use are zeroed */ - if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) + return -EINVAL; + + /* mark is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), + offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + - sizeof_field(struct __sk_buff, priority), + if 
(!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + - sizeof_field(struct __sk_buff, cb), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ + /* wire_len is allowed */ + /* gso_segs is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + - sizeof_field(struct __sk_buff, tstamp), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), sizeof(struct __sk_buff))) return -EINVAL; + skb->mark = __skb->mark; skb->priority = __skb->priority; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + if (__skb->wire_len == 0) { + cb->pkt_len = skb->len; + } else { + if (__skb->wire_len < skb->len || + __skb->wire_len > GSO_MAX_SIZE) + return -EINVAL; + cb->pkt_len = __skb->wire_len; + } + + if (__skb->gso_segs > GSO_MAX_SEGS) + return -EINVAL; + skb_shinfo(skb)->gso_segs = __skb->gso_segs; + return 0; } @@ -285,9 +308,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!__skb) return; + __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); + __skb->wire_len = cb->pkt_len; + __skb->gso_segs = skb_shinfo(skb)->gso_segs; } int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, @@ -359,7 +385,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; - ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { @@ -416,8 +442,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + bpf_prog_change_xdp(NULL, prog); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || @@ -425,6 +451,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: + bpf_prog_change_xdp(prog, NULL); kfree(data); return ret; } @@ -437,8 +464,7 @@ static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) /* flags is allowed */ - if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) + - sizeof_field(struct bpf_flow_keys, flags), + if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; diff --git a/net/core/dev.c b/net/core/dev.c index 0ad39c87b7fd..f1b8afcfbc0f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8542,7 +8542,17 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { + bool non_hw = !(flags & XDP_FLAGS_HW_MODE); + struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; + int err; + + if (non_hw) { + prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, + XDP_QUERY_PROG)); + if (IS_ERR(prev_prog)) + prev_prog = NULL; + } memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -8553,7 +8563,14 @@ static int dev_xdp_install(struct 
net_device *dev, bpf_op_t bpf_op, xdp.flags = flags; xdp.prog = prog; - return bpf_op(dev, &xdp); + err = bpf_op(dev, &xdp); + if (!err && non_hw) + bpf_prog_change_xdp(prev_prog, prog); + + if (prev_prog) + bpf_prog_put(prev_prog); + + return err; } static void dev_xdp_uninstall(struct net_device *dev) diff --git a/net/core/filter.c b/net/core/filter.c index 28b3c258188c..42fd17c48c5f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3511,36 +3511,16 @@ err: } static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) + struct bpf_map *map, struct xdp_buff *xdp) { - int err; - switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: { - struct bpf_dtab_netdev *dst = fwd; - - err = dev_map_enqueue(dst, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_CPUMAP: { - struct bpf_cpu_map_entry *rcpu = fwd; - - err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_XSKMAP: { - struct xdp_sock *xs = fwd; - - err = __xsk_map_redirect(map, xdp, xs); - return err; - } + case BPF_MAP_TYPE_DEVMAP_HASH: + return dev_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_CPUMAP: + return cpu_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_redirect(fwd, xdp); default: break; } @@ -3549,26 +3529,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = ri->map_to_flush; - - ri->map_to_flush = NULL; - if (map) { - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - __dev_map_flush(map); - break; - case BPF_MAP_TYPE_CPUMAP: - __cpu_map_flush(map); - break; - case BPF_MAP_TYPE_XSKMAP: - __xsk_map_flush(map); - break; - default: - break; - } - } + __dev_map_flush(); + __cpu_map_flush(); + __xsk_map_flush(); } EXPORT_SYMBOL_GPL(xdp_do_flush_map); @@ -3617,14 +3580,10 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) - xdp_do_flush_map(); - - err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); if (unlikely(err)) goto err; - ri->map_to_flush = map; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: @@ -8941,3 +8900,11 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ + +DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) + +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) +{ + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), + prev_prog, prog); +} diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 328f661b83b2..02ada7ab8c6e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,8 @@ #define TX_BATCH_SIZE 16 +static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); + bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && @@ -39,21 +41,21 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { - return xskq_has_addrs(umem->fq, cnt); + return xskq_cons_has_entries(umem->fq, cnt); } EXPORT_SYMBOL(xsk_umem_has_addrs); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +bool 
xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { - return xskq_peek_addr(umem->fq, addr, umem); + return xskq_cons_peek_addr(umem->fq, addr, umem); } EXPORT_SYMBOL(xsk_umem_peek_addr); -void xsk_umem_discard_addr(struct xdp_umem *umem) +void xsk_umem_release_addr(struct xdp_umem *umem) { - xskq_discard_addr(umem->fq); + xskq_cons_release(umem->fq); } -EXPORT_SYMBOL(xsk_umem_discard_addr); +EXPORT_SYMBOL(xsk_umem_release_addr); void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { @@ -124,7 +126,7 @@ static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, void *to_buf = xdp_umem_get_data(umem, addr); addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) { + if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; u64 page_start = addr & ~(PAGE_SIZE - 1); u64 first_len = PAGE_SIZE - (addr - page_start); @@ -146,7 +148,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) u32 metalen; int err; - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; @@ -165,9 +167,9 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) offset += metalen; addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (!err) { - xskq_discard_addr(xs->umem->fq); + xskq_cons_release(xs->umem->fq); xdp_return_buff(xdp); return 0; } @@ -178,7 +180,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); + int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); if (err) xs->rx_dropped++; @@ -214,7 +216,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { - xskq_produce_flush_desc(xs->rx); + xskq_prod_submit(xs->rx); xs->sk.sk_data_ready(&xs->sk); } @@ -234,7 +236,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) goto out_unlock; } - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { err = -ENOSPC; goto out_drop; @@ -245,12 +247,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) memcpy(buffer, xdp->data_meta, len + metalen); addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) goto out_drop; - xskq_discard_addr(xs->umem->fq); - xskq_produce_flush_desc(xs->rx); + xskq_cons_release(xs->umem->fq); + xskq_prod_submit(xs->rx); spin_unlock_bh(&xs->rx_lock); @@ -264,11 +266,9 @@ out_unlock: return err; } -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; err = xsk_rcv(xs, xdp); @@ -281,10 +281,9 @@ int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, return 0; } -void 
__xsk_map_flush(struct bpf_map *map)
+void __xsk_map_flush(void)
 {
-	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
 	struct xdp_sock *xs, *tmp;
 
 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
@@ -295,7 +294,7 @@ void __xsk_map_flush(struct bpf_map *map)
 
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
 {
-	xskq_produce_flush_addr_n(umem->cq, nb_entries);
+	xskq_prod_submit_n(umem->cq, nb_entries);
 }
 EXPORT_SYMBOL(xsk_umem_complete_tx);
 
@@ -317,13 +316,18 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
-		if (!xskq_peek_desc(xs->tx, desc, umem))
+		if (!xskq_cons_peek_desc(xs->tx, desc, umem))
 			continue;
 
-		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
+		/* This is the backpressure mechanism for the Tx path.
+		 * Reserve space in the completion queue and only proceed
+		 * if there is space in it. This avoids having to implement
+		 * any buffering in the Tx path.
+		 */
+		if (xskq_prod_reserve_addr(umem->cq, desc->addr))
 			goto out;
 
-		xskq_discard_desc(xs->tx);
+		xskq_cons_release(xs->tx);
 		rcu_read_unlock();
 		return true;
 	}
 
@@ -358,7 +362,7 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 	unsigned long flags;
 
 	spin_lock_irqsave(&xs->tx_completion_lock, flags);
-	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
+	xskq_prod_submit_addr(xs->umem->cq, addr);
 	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
 
 	sock_wfree(skb);
@@ -378,7 +382,7 @@ static int xsk_generic_xmit(struct sock *sk)
 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
 		goto out;
 
-	while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
+	while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
 		char *buffer;
 		u64 addr;
 		u32 len;
@@ -399,7 +403,12 @@ static int xsk_generic_xmit(struct sock *sk)
 
 		addr = desc.addr;
 		buffer = xdp_umem_get_data(xs->umem, addr);
 		err = skb_store_bits(skb, 0, buffer, len);
-		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
+		/* This is the backpressure mechanism for the Tx path.
+		 * Reserve space in the completion queue and only proceed
+		 * if there is space in it. This avoids having to implement
+		 * any buffering in the Tx path.
+ */ + if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { kfree_skb(skb); goto out; } @@ -411,7 +420,7 @@ static int xsk_generic_xmit(struct sock *sk) skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { /* SKB completed but not sent */ @@ -477,9 +486,9 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, __xsk_sendmsg(sk); } - if (xs->rx && !xskq_empty_desc(xs->rx)) + if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; - if (xs->tx && !xskq_full_desc(xs->tx)) + if (xs->tx && !xskq_cons_is_full(xs->tx)) mask |= EPOLLOUT | EPOLLWRNORM; return mask; @@ -1183,7 +1192,7 @@ static struct pernet_operations xsk_net_ops = { static int __init xsk_init(void) { - int err; + int err, cpu; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) @@ -1201,6 +1210,8 @@ static int __init xsk_init(void) if (err) goto out_pernet; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); return 0; out_pernet: diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index b66504592d9b..c90e9c1e3c63 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -18,14 +18,14 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) q->chunk_mask = chunk_mask; } -static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); -} + struct xdp_umem_ring *umem_ring; + struct xdp_rxtx_ring *rxtx_ring; -static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) -{ - return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); + if (umem_queue) + return struct_size(umem_ring, desc, q->nentries); + return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) @@ -43,8 +43,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = umem_queue ? xskq_umem_get_ring_size(q) : - xskq_rxtx_get_ring_size(q); + size = xskq_get_ring_size(q, umem_queue); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index eddae4688862..bec2af11853a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,9 +10,6 @@ #include <linux/if_xdp.h> #include <net/xdp_sock.h> -#define RX_BATCH_SIZE 16 -#define LAZY_UPDATE_THRESHOLD 128 - struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; @@ -36,10 +33,8 @@ struct xsk_queue { u64 size; u32 ring_mask; u32 nentries; - u32 prod_head; - u32 prod_tail; - u32 cons_head; - u32 cons_tail; + u32 cached_prod; + u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; }; @@ -86,56 +81,31 @@ struct xsk_queue { * now and again after circling through the ring. */ -/* Common functions operating for both RXTX and umem queues */ - -static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) -{ - return q ? q->invalid_descs : 0; -} - -static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) -{ - u32 entries = q->prod_tail - q->cons_tail; - - if (entries == 0) { - /* Refresh the local pointer */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; - } - - return (entries > dcnt) ? 
dcnt : entries;
-}
-
-static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
-{
-	u32 free_entries = q->nentries - (producer - q->cons_tail);
-
-	if (free_entries >= dcnt)
-		return free_entries;
-
-	/* Refresh the local tail pointer */
-	q->cons_tail = READ_ONCE(q->ring->consumer);
-	return q->nentries - (producer - q->cons_tail);
-}
-
-static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
-{
-	u32 entries = q->prod_tail - q->cons_tail;
-
-	if (entries >= cnt)
-		return true;
-
-	/* Refresh the local pointer. */
-	q->prod_tail = READ_ONCE(q->ring->producer);
-	entries = q->prod_tail - q->cons_tail;
-
-	return entries >= cnt;
-}
+/* The operations on the rings are the following:
+ *
+ * producer                           consumer
+ *
+ * RESERVE entries                    PEEK in the ring for entries
+ * WRITE data into the ring           READ data from the ring
+ * SUBMIT entries                     RELEASE entries
+ *
+ * The producer reserves one or more entries in the ring. It can then
+ * fill in these entries and finally submit them so that they can be
+ * seen and read by the consumer.
+ *
+ * The consumer peeks into the ring to see if the producer has written
+ * any new entries. If so, the consumer can then read these entries
+ * and when it is done reading them release them back to the producer
+ * so that the producer can use these slots to fill in new entries.
+ *
+ * The function names below reflect these operations.
+ */
 
-/* UMEM queue */
+/* Functions that read and validate content from consumer rings. */
 
-static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
-					      u64 length)
+static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem,
+						   u64 addr,
+						   u64 length)
 {
 	bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
 	bool next_pg_contig =
@@ -145,9 +115,16 @@ static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
 	return cross_pg && !next_pg_contig;
 }
 
-static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
+static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
+						u64 addr,
+						u64 length,
+						struct xdp_umem *umem)
 {
-	if (addr >= q->size) {
+	u64 base_addr = xsk_umem_extract_addr(addr);
+
+	addr = xsk_umem_add_offset_to_addr(addr);
+	if (base_addr >= q->size || addr >= q->size ||
+	    xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
 		q->invalid_descs++;
 		return false;
 	}
@@ -155,15 +132,9 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 	return true;
 }
 
-static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
-						u64 length,
-						struct xdp_umem *umem)
+static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
 {
-	u64 base_addr = xsk_umem_extract_addr(addr);
-
-	addr = xsk_umem_add_offset_to_addr(addr);
-	if (base_addr >= q->size || addr >= q->size ||
-	    xskq_crosses_non_contig_pg(umem, addr, length)) {
+	if (addr >= q->size) {
 		q->invalid_descs++;
 		return false;
 	}
@@ -171,204 +142,240 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
 	return true;
 }
 
-static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
-				      struct xdp_umem *umem)
+static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr,
+				       struct xdp_umem *umem)
 {
-	while (q->cons_tail != q->cons_head) {
-		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-		unsigned int idx = q->cons_tail & q->ring_mask;
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
-		*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
+	while (q->cached_cons !=
q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx] & q->chunk_mask; if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_is_valid_addr_unaligned(q, *addr, + if (xskq_cons_is_valid_unaligned(q, *addr, umem->chunk_size_nohr, umem)) - return addr; + return true; goto out; } - if (xskq_is_valid_addr(q, *addr)) - return addr; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; out: - q->cons_tail++; + q->cached_cons++; } - return NULL; + return false; } -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) + return false; + + if (d->len > umem->chunk_size_nohr || d->options) { + q->invalid_descs++; + return false; + } - /* Order consumer and data */ - smp_rmb(); + return true; } - return xskq_validate_addr(q, addr, umem); -} + if (!xskq_cons_is_valid_addr(q, d->addr)) + return false; -static inline void xskq_discard_addr(struct xsk_queue *q) -{ - q->cons_tail++; + if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || + d->options) { + q->invalid_descs++; + return false; + } + + return true; } -static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) +static inline bool xskq_cons_read_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - if (xskq_nb_free(q, q->prod_tail, 1) == 0) - return -ENOSPC; + while (q->cached_cons != q->cached_prod) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = q->cached_cons & q->ring_mask; - /* A, matches D */ - ring->desc[q->prod_tail++ & q->ring_mask] = addr; + *desc = ring->desc[idx]; + if (xskq_cons_is_valid_desc(q, desc, umem)) + return true; - /* Order producer and data */ - smp_wmb(); /* B, matches C */ + q->cached_cons++; + } - WRITE_ONCE(q->ring->producer, q->prod_tail); - return 0; + return false; } -static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +/* Functions for consumers */ + +static inline void __xskq_cons_release(struct xsk_queue *q) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + smp_mb(); /* D, matches A */ + WRITE_ONCE(q->ring->consumer, q->cached_cons); +} - if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) - return -ENOSPC; +static inline void __xskq_cons_peek(struct xsk_queue *q) +{ + /* Refresh the local pointer */ + q->cached_prod = READ_ONCE(q->ring->producer); + smp_rmb(); /* C, matches B */ +} - /* A, matches D */ - ring->desc[q->prod_head++ & q->ring_mask] = addr; - return 0; +static inline void xskq_cons_get_entries(struct xsk_queue *q) +{ + __xskq_cons_release(q); + __xskq_cons_peek(q); } -static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, - u32 nb_entries) +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) { - /* Order producer and data */ - smp_wmb(); /* B, matches C */ + u32 entries = q->cached_prod - q->cached_cons; - q->prod_tail += nb_entries; - WRITE_ONCE(q->ring->producer, q->prod_tail); + if (entries >= cnt) + return true; + + __xskq_cons_peek(q); + entries = q->cached_prod - q->cached_cons; + + 
return entries >= cnt; } -static inline int xskq_reserve_addr(struct xsk_queue *q) +static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { - if (xskq_nb_free(q, q->prod_head, 1) == 0) - return -ENOSPC; + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr(q, addr, umem); +} - /* A, matches D */ - q->prod_head++; - return 0; +static inline bool xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_desc(q, desc, umem); } -/* Rx/Tx queue */ +static inline void xskq_cons_release(struct xsk_queue *q) +{ + /* To improve performance, only update local state here. + * Reflect this to global state when we get new entries + * from the ring in xskq_cons_get_entries(). + */ + q->cached_cons++; +} -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_is_full(struct xsk_queue *q) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem)) - return false; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; +} - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } +/* Functions for producers */ - return true; - } +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (!xskq_is_valid_addr(q, d->addr)) + if (free_entries) return false; - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { - q->invalid_descs++; - return false; - } + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); - return true; + return !free_entries; } -static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline int xskq_prod_reserve(struct xsk_queue *q) { - while (q->cons_tail != q->cons_head) { - struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; - - *desc = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_desc(q, desc, umem)) - return desc; - - q->cons_tail++; - } + if (xskq_prod_is_full(q)) + return -ENOSPC; - return NULL; + /* A, matches D */ + q->cached_prod++; + return 0; } -static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); - - /* Order consumer and data */ - smp_rmb(); /* C, matches B */ - } + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - return xskq_validate_desc(q, desc, umem); -} + if (xskq_prod_is_full(q)) + return -ENOSPC; -static inline void xskq_discard_desc(struct xsk_queue *q) -{ - q->cons_tail++; + /* A, matches D */ + ring->desc[q->cached_prod++ & q->ring_mask] = addr; + return 0; } -static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u64 addr, u32 len) +static inline int xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len) { struct 
xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx; + u32 idx; - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ - idx = (q->prod_head++) & q->ring_mask; + idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; return 0; } -static inline void xskq_produce_flush_desc(struct xsk_queue *q) +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail = q->prod_head; - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, idx); +} + +static inline void xskq_prod_submit(struct xsk_queue *q) +{ + __xskq_prod_submit(q, q->cached_prod); +} + +static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = q->ring->producer; + + ring->desc[idx++ & q->ring_mask] = addr; + + __xskq_prod_submit(q, idx); } -static inline bool xskq_full_desc(struct xsk_queue *q) +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { - return xskq_nb_avail(q, q->nentries) == q->nentries; + __xskq_prod_submit(q, q->ring->producer + nb_entries); } -static inline bool xskq_empty_desc(struct xsk_queue *q) +static inline bool xskq_prod_is_empty(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); +} + +/* For both producers and consumers */ + +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? q->invalid_descs : 0; } void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 1fc42ad8ff49..5b89c0370f33 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -38,6 +38,8 @@ tprogs-y += tc_l2_redirect tprogs-y += lwt_len_hist tprogs-y += xdp_tx_iptunnel tprogs-y += test_map_in_map +tprogs-y += per_socket_stats_example +tprogs-y += xdp_redirect tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect_cpu tprogs-y += xdp_monitor @@ -196,7 +198,7 @@ endif TPROGCFLAGS_bpf_load.o += -Wno-unused-variable -TPROGS_LDLIBS += $(LIBBPF) -lelf +TPROGS_LDLIBS += $(LIBBPF) -lelf -lz TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt @@ -234,6 +236,7 @@ BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \ readelf -S ./llvm_btf_verify.o | grep BTF; \ /bin/rm -f ./llvm_btf_verify.o) +BPF_EXTRA_CFLAGS += -fno-stack-protector ifneq ($(BTF_LLVM_PROBE),) BPF_EXTRA_CFLAGS += -g else diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 3e553eed95a7..38a8852cb57f 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -98,7 +98,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -109,6 +109,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c index d86e9ad0356b..008789eb6ada 100644 --- a/samples/bpf/xdp_adjust_tail_user.c +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -120,7 +120,7 @@ int main(int argc, char **argv) 
xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -132,6 +132,9 @@ int main(int argc, char **argv) opt_flags[opt] = 0; } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + for (i = 0; i < strlen(optstr); i++) { if (opt_flags[(unsigned int)optstr[i]]) { fprintf(stderr, "Missing argument -%c\n", optstr[i]); diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index 97ff1dad7669..c30f9acfdb84 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -27,11 +27,13 @@ #include "libbpf.h" #include <bpf/bpf.h> +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + static int do_attach(int idx, int prog_fd, int map_fd, const char *name) { int err; - err = bpf_set_link_xdp_fd(idx, prog_fd, 0); + err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags); if (err < 0) { printf("ERROR: failed to attach program to %s\n", name); return err; @@ -49,7 +51,7 @@ static int do_detach(int idx, const char *name) { int err; - err = bpf_set_link_xdp_fd(idx, -1, 0); + err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); if (err < 0) printf("ERROR: failed to detach program from %s\n", name); @@ -83,11 +85,17 @@ int main(int argc, char **argv) int attach = 1; int ret = 0; - while ((opt = getopt(argc, argv, ":dD")) != -1) { + while ((opt = getopt(argc, argv, ":dDSF")) != -1) { switch (opt) { case 'd': attach = 0; break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; case 'D': prog_name = "xdp_fwd_direct"; break; @@ -97,6 +105,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 0da6e9e7132e..79a2fb7d16cb 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -16,6 +16,10 @@ static const char *__doc__ = #include <getopt.h> #include <net/if.h> #include <time.h> +#include <linux/limits.h> + +#define __must_check +#include <linux/err.h> #include <arpa/inet.h> #include <linux/if_link.h> @@ -46,6 +50,10 @@ static int cpus_count_map_fd; static int cpus_iterator_map_fd; static int exception_cnt_map_fd; +#define NUM_TP 5 +struct bpf_link *tp_links[NUM_TP] = { 0 }; +static int tp_cnt = 0; + /* Exit return codes */ #define EXIT_OK 0 #define EXIT_FAIL 1 @@ -88,6 +96,10 @@ static void int_exit(int sig) printf("program on interface changed, not removing\n"); } } + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + exit(EXIT_OK); } @@ -588,23 +600,61 @@ static void stats_poll(int interval, bool use_separators, char *prog_name, free_stats_record(prev); } +static struct bpf_link * attach_tp(struct bpf_object *obj, + const char *tp_category, + const char* tp_name) +{ + struct bpf_program *prog; + struct bpf_link *link; + char sec_name[PATH_MAX]; + int len; + + len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s", + tp_category, tp_name); + if (len < 0) + exit(EXIT_FAIL); + + prog = bpf_object__find_program_by_title(obj, sec_name); + if (!prog) { + fprintf(stderr, "ERR: finding progsec: %s\n", sec_name); + exit(EXIT_FAIL_BPF); + } + + link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); + if (IS_ERR(link)) + exit(EXIT_FAIL_BPF); + + return link; +} + +static void init_tracepoints(struct 
bpf_object *obj) { + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue"); + tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread"); +} + static int init_map_fds(struct bpf_object *obj) { - cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); - rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); + /* Maps updated by tracepoints */ redirect_err_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt"); + exception_cnt_map_fd = + bpf_object__find_map_fd_by_name(obj, "exception_cnt"); cpumap_enqueue_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt"); cpumap_kthread_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt"); + + /* Maps used by XDP */ + rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); + cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); cpus_available_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_available"); cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count"); cpus_iterator_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_iterator"); - exception_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "exception_cnt"); if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 || redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 || @@ -662,6 +712,7 @@ int main(int argc, char **argv) strerror(errno)); return EXIT_FAIL; } + init_tracepoints(obj); if (init_map_fds(obj) < 0) { fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); return EXIT_FAIL; @@ -728,6 +779,10 @@ int main(int argc, char **argv) return EXIT_FAIL_OPTION; } } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + /* Required option */ if (ifindex == -1) { fprintf(stderr, "ERR: required option --dev missing\n"); diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index f70ee33907fd..cc840661faab 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -116,7 +116,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -127,6 +127,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); return 1; diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 5440cd620607..71dff8e3382a 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -117,7 +117,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -128,6 +128,9 @@ int main(int argc, char **argv) } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); return 1; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 1469b66ebad1..fef286c5add2 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -662,6 +662,9 @@ int main(int ac, 
char **argv) } } + if (!(flags & XDP_FLAGS_SKB_MODE)) + flags |= XDP_FLAGS_DRV_MODE; + if (optind == ac) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 8fc3ad01de72..fc4983fd6959 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -551,6 +551,10 @@ int main(int argc, char **argv) return EXIT_FAIL_OPTION; } } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + /* Required option */ if (ifindex == -1) { fprintf(stderr, "ERR: required option --dev missing\n"); diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index a5760e8bf2c4..8c1af1b7372d 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -52,13 +52,13 @@ static int do_detach(int idx, const char *name) __u32 curr_prog_id = 0; int err = 0; - err = bpf_get_link_xdp_id(idx, &curr_prog_id, 0); + err = bpf_get_link_xdp_id(idx, &curr_prog_id, xdp_flags); if (err) { printf("bpf_get_link_xdp_id failed\n"); return err; } if (prog_id == curr_prog_id) { - err = bpf_set_link_xdp_fd(idx, -1, 0); + err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); if (err < 0) printf("ERROR: failed to detach prog from %s\n", name); } else if (!curr_prog_id) { @@ -115,7 +115,7 @@ int main(int argc, char **argv) .prog_type = BPF_PROG_TYPE_XDP, }; struct perf_buffer_opts pb_opts = {}; - const char *optstr = "F"; + const char *optstr = "FS"; int prog_fd, map_fd, opt; struct bpf_object *obj; struct bpf_map *map; @@ -127,12 +127,18 @@ int main(int argc, char **argv) case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; default: usage(basename(argv[0])); return 1; } } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + if (optind == argc) { usage(basename(argv[0])); return 1; diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 2fe4c7f5ffe5..5f33b5530032 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -231,7 +231,7 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'F': xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; @@ -243,6 +243,9 @@ int main(int argc, char **argv) opt_flags[opt] = 0; } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + for (i = 0; i < strlen(optstr); i++) { if (opt_flags[(unsigned int)optstr[i]]) { fprintf(stderr, "Missing argument -%c\n", optstr[i]); diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index a15480010828..d74c4c83fc93 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -10,6 +10,9 @@ #include <linux/if_link.h> #include <linux/if_xdp.h> #include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <arpa/inet.h> #include <locale.h> #include <net/ethernet.h> #include <net/if.h> @@ -45,12 +48,14 @@ #endif #define NUM_FRAMES (4 * 1024) -#define BATCH_SIZE 64 +#define MIN_PKT_SIZE 64 #define DEBUG_HEXDUMP 0 typedef __u64 u64; typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; static unsigned long prev_time; @@ -65,6 +70,13 @@ static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static const char *opt_if = ""; static int opt_ifindex; static int opt_queue; +static unsigned long opt_duration; +static unsigned long start_time; +static bool benchmark_done; +static u32 
opt_batch_size = 64; +static int opt_pkt_count; +static u16 opt_pkt_size = MIN_PKT_SIZE; +static u32 opt_pkt_fill_pattern = 0x12345678; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -167,10 +179,21 @@ static void dump_stats(void) } } +static bool is_benchmark_done(void) +{ + if (opt_duration > 0) { + unsigned long dt = (get_nsecs() - start_time); + + if (dt >= opt_duration) + benchmark_done = true; + } + return benchmark_done; +} + static void *poller(void *arg) { (void)arg; - for (;;) { + while (!is_benchmark_done()) { sleep(opt_interval); dump_stats(); } @@ -196,6 +219,11 @@ static void remove_xdp_program(void) static void int_exit(int sig) { + benchmark_done = true; +} + +static void xdpsock_cleanup(void) +{ struct xsk_umem *umem = xsks[0]->umem->umem; int i; @@ -204,8 +232,6 @@ static void int_exit(int sig) xsk_socket__delete(xsks[i]->xsk); (void)xsk_umem__delete(umem); remove_xdp_program(); - - exit(EXIT_SUCCESS); } static void __exit_with_error(int error, const char *file, const char *func, @@ -220,13 +246,6 @@ static void __exit_with_error(int error, const char *file, const char *func, #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ __LINE__) - -static const char pkt_data[] = - "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" - "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" - "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" - "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; - static void swap_mac_addresses(void *data) { struct ether_header *eth = (struct ether_header *)data; @@ -274,11 +293,243 @@ static void hex_dump(void *pkt, size_t length, u64 addr) printf("\n"); } -static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr) +static void *memset32_htonl(void *dest, u32 val, u32 size) +{ + u32 *ptr = (u32 *)dest; + int i; + + val = htonl(val); + + for (i = 0; i < (size & (~0x3)); i += 4) + ptr[i >> 2] = val; + + for (; i < size; i++) + ((char *)dest)[i] = ((char *)&val)[i & 3]; + + return dest; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline unsigned short from32to16(unsigned int x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. 
*/ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static unsigned int do_csum(const unsigned char *buff, int len) +{ + unsigned int result = 0; + int odd; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long)buff; + if (odd) { +#ifdef __LITTLE_ENDIAN + result += (*buff << 8); +#else + result = *buff; +#endif + len--; + buff++; + } + if (len >= 2) { + if (2 & (unsigned long)buff) { + result += *(unsigned short *)buff; + len -= 2; + buff += 2; + } + if (len >= 4) { + const unsigned char *end = buff + + ((unsigned int)len & ~3); + unsigned int carry = 0; + + do { + unsigned int w = *(unsigned int *)buff; + + buff += 4; + result += carry; + result += w; + carry = (w > result); + } while (buff < end); + result += carry; + result = (result & 0xffff) + (result >> 16); + } + if (len & 2) { + result += *(unsigned short *)buff; + buff += 2; + } + } + if (len & 1) +#ifdef __LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 8); +#endif + result = from32to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +__sum16 ip_fast_csum(const void *iph, unsigned int ihl); + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + return (__force __sum16)~do_csum(iph, ihl * 4); +} + +/* + * Fold a partial checksum + * This function code has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __sum16 csum_fold(__wsum csum) +{ + u32 sum = (__force u32)csum; + + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __sum16)~sum; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. 
*/ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + +__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, __wsum sum); + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, __wsum sum) +{ + unsigned long long s = (__force u32)sum; + + s += (__force u32)saddr; + s += (__force u32)daddr; +#ifdef __BIG_ENDIAN__ + s += proto + len; +#else + s += (proto + len) << 8; +#endif + return (__force __wsum)from64to32(s); +} + +/* + * This function has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __sum16 +csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, + u8 proto, u16 *udp_pkt) +{ + u32 csum = 0; + u32 cnt = 0; + + /* udp hdr and data */ + for (; cnt < len; cnt += 2) + csum += udp_pkt[cnt >> 1]; + + return csum_tcpudp_magic(saddr, daddr, len, proto, csum); +} + +#define ETH_FCS_SIZE 4 + +#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr)) + +#define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) +#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) +#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) +#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) + +static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; + +static void gen_eth_hdr_data(void) +{ + struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + + sizeof(struct ethhdr) + + sizeof(struct iphdr)); + struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + + sizeof(struct ethhdr)); + struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + + /* ethernet header */ + memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); + memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); + + /* IP header */ + ip_hdr->version = IPVERSION; + ip_hdr->ihl = 0x5; /* 20 byte header */ + ip_hdr->tos = 0x0; + ip_hdr->tot_len = htons(IP_PKT_SIZE); + ip_hdr->id = 0; + ip_hdr->frag_off = 0; + ip_hdr->ttl = IPDEFTTL; + ip_hdr->protocol = IPPROTO_UDP; + ip_hdr->saddr = htonl(0x0a0a0a10); + ip_hdr->daddr = htonl(0x0a0a0a20); + + /* IP header checksum */ + ip_hdr->check = 0; + ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); + + /* UDP header */ + udp_hdr->source = htons(0x1000); + udp_hdr->dest = htons(0x1000); + udp_hdr->len = htons(UDP_PKT_SIZE); + + /* UDP data */ + memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, + UDP_PKT_DATA_SIZE); + + /* UDP header checksum */ + udp_hdr->check = 0; + udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, + IPPROTO_UDP, (u16 *)udp_hdr); +} + +static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) { memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, - sizeof(pkt_data) - 1); - return sizeof(pkt_data) - 1; + PKT_SIZE); } static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) @@ -375,6 +626,11 @@ static struct option long_options[] = { {"unaligned", no_argument, 0, 'u'}, {"shared-umem", no_argument, 0, 'M'}, {"force", no_argument, 0, 'F'}, + {"duration", required_argument, 0, 'd'}, + {"batch-size", required_argument, 0, 'b'}, + {"tx-pkt-count", required_argument, 0, 'C'}, + {"tx-pkt-size", required_argument, 0, 's'}, + {"tx-pkt-pattern", required_argument, 0, 'P'}, {0, 0, 0, 0} }; @@ -399,8 +655,21 @@ static void 
usage(const char *prog) " -u, --unaligned Enable unaligned chunk placement\n" " -M, --shared-umem Enable XDP_SHARED_UMEM\n" " -F, --force Force loading the XDP prog\n" + " -d, --duration=n Duration in secs to run command.\n" + " Default: forever.\n" + " -b, --batch-size=n Batch size for sending or receiving\n" + " packets. Default: %d\n" + " -C, --tx-pkt-count=n Number of packets to send.\n" + " Default: Continuous packets.\n" + " -s, --tx-pkt-size=n Transmit packet size.\n" + " (Default: %d bytes)\n" + " Min size: %d, Max size %d.\n" + " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" "\n"; - fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE); + fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, + opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, + XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); + exit(EXIT_FAILURE); } @@ -411,7 +680,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:muM", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:", long_options, &option_index); if (c == -1) break; @@ -440,7 +709,7 @@ static void parse_command_line(int argc, char **argv) opt_xdp_bind_flags |= XDP_COPY; break; case 'N': - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ break; case 'n': opt_interval = atoi(optarg); @@ -469,11 +738,37 @@ static void parse_command_line(int argc, char **argv) case 'M': opt_num_xsks = MAX_SOCKS; break; + case 'd': + opt_duration = atoi(optarg); + opt_duration *= 1000000000; + break; + case 'b': + opt_batch_size = atoi(optarg); + break; + case 'C': + opt_pkt_count = atoi(optarg); + break; + case 's': + opt_pkt_size = atoi(optarg); + if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) || + opt_pkt_size < MIN_PKT_SIZE) { + fprintf(stderr, + "ERROR: Invalid frame size %d\n", + opt_pkt_size); + usage(basename(argv[0])); + } + break; + case 'P': + opt_pkt_fill_pattern = strtol(optarg, NULL, 16); + break; default: usage(basename(argv[0])); } } + if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE)) + opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + opt_ifindex = if_nametoindex(opt_if); if (!opt_ifindex) { fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", @@ -513,7 +808,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); - ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE : + ndescs = (xsk->outstanding_tx > opt_batch_size) ? 
opt_batch_size : xsk->outstanding_tx; /* re-add completed Tx buffers */ @@ -542,7 +837,8 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, } } -static inline void complete_tx_only(struct xsk_socket_info *xsk) +static inline void complete_tx_only(struct xsk_socket_info *xsk, + int batch_size) { unsigned int rcvd; u32 idx; @@ -553,7 +849,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx); + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; @@ -567,7 +863,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) u32 idx_rx = 0, idx_fq = 0; int ret; - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) ret = poll(fds, num_socks, opt_timeout); @@ -619,36 +915,68 @@ static void rx_drop_all(void) for (i = 0; i < num_socks; i++) rx_drop(xsks[i], fds); + + if (benchmark_done) + break; } } -static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb) +static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size) { u32 idx; + unsigned int i; - if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) { - unsigned int i; - - for (i = 0; i < BATCH_SIZE; i++) { - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr = - (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = - sizeof(pkt_data) - 1; - } + while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < + batch_size) { + complete_tx_only(xsk, batch_size); + } - xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); - xsk->outstanding_tx += BATCH_SIZE; - frame_nb += BATCH_SIZE; - frame_nb %= NUM_FRAMES; + for (i = 0; i < batch_size; i++) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, + idx + i); + tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; + tx_desc->len = PKT_SIZE; } - complete_tx_only(xsk); + xsk_ring_prod__submit(&xsk->tx, batch_size); + xsk->outstanding_tx += batch_size; + frame_nb += batch_size; + frame_nb %= NUM_FRAMES; + complete_tx_only(xsk, batch_size); +} + +static inline int get_batch_size(int pkt_cnt) +{ + if (!opt_pkt_count) + return opt_batch_size; + + if (pkt_cnt + opt_batch_size <= opt_pkt_count) + return opt_batch_size; + + return opt_pkt_count - pkt_cnt; +} + +static void complete_tx_only_all(void) +{ + bool pending; + int i; + + do { + pending = false; + for (i = 0; i < num_socks; i++) { + if (xsks[i]->outstanding_tx) { + complete_tx_only(xsks[i], opt_batch_size); + pending = !!xsks[i]->outstanding_tx; + } + } + } while (pending); } static void tx_only_all(void) { struct pollfd fds[MAX_SOCKS] = {}; u32 frame_nb[MAX_SOCKS] = {}; + int pkt_cnt = 0; int i, ret; for (i = 0; i < num_socks; i++) { @@ -656,7 +984,9 @@ static void tx_only_all(void) fds[0].events = POLLOUT; } - for (;;) { + while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { + int batch_size = get_batch_size(pkt_cnt); + if (opt_poll) { ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) @@ -667,8 +997,16 @@ static void tx_only_all(void) } for (i = 0; i < num_socks; i++) - tx_only(xsks[i], frame_nb[i]); + tx_only(xsks[i], frame_nb[i], batch_size); + + pkt_cnt += batch_size; + + if (benchmark_done) + break; } + + if 
(opt_pkt_count) + complete_tx_only_all(); } static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) @@ -679,7 +1017,7 @@ complete_tx_l2fwd(xsk, fds); - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) ret = poll(fds, num_socks, opt_timeout); @@ -736,6 +1074,9 @@ static void l2fwd_all(void) for (i = 0; i < num_socks; i++) l2fwd(xsks[i], fds); + + if (benchmark_done) + break; } } @@ -831,9 +1172,12 @@ int main(int argc, char **argv) for (i = 0; i < opt_num_xsks; i++) xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); - if (opt_bench == BENCH_TXONLY) + if (opt_bench == BENCH_TXONLY) { + gen_eth_hdr_data(); + for (i = 0; i < NUM_FRAMES; i++) gen_eth_frame(umem, i * opt_xsk_frame_size); + } if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) enter_xsks_into_map(obj); @@ -849,6 +1193,7 @@ exit_with_error(ret); prev_time = get_nsecs(); + start_time = prev_time; if (opt_bench == BENCH_RXDROP) rx_drop_all(); @@ -857,5 +1202,11 @@ else l2fwd_all(); + benchmark_done = true; + + pthread_join(pt, NULL); + + xdpsock_cleanup(); + return 0; } diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst new file mode 100644 index 000000000000..86a87da97d0b --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -0,0 +1,305 @@ +================ +bpftool-gen +================ +------------------------------------------------------------------------------- +tool for BPF code-generation +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **gen** *COMMAND* + + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + + *COMMAND* := { **skeleton** | **help** } + +GEN COMMANDS +============= + +| **bpftool** **gen skeleton** *FILE* +| **bpftool** **gen help** + +DESCRIPTION +=========== + **bpftool gen skeleton** *FILE* + Generate a BPF skeleton C header file for a given *FILE*. + + A BPF skeleton is an alternative interface to existing libbpf + APIs for working with BPF objects. Skeleton code is intended + to significantly shorten and simplify code to load and work + with BPF programs from the userspace side. Generated code is + tailored to the specific input BPF object *FILE*, reflecting + its structure by listing out available maps, programs, + variables, etc. The skeleton eliminates the need to look up + the mentioned components by name. Instead, if skeleton + instantiation succeeds, they are populated in the skeleton + structure as valid libbpf types (e.g., struct bpf_map + pointer) and can be passed to existing generic libbpf APIs. + + In addition to simple and reliable access to maps and + programs, the skeleton provides storage for BPF links (struct + bpf_link) for each BPF program within the BPF object. When + requested, supported BPF programs will be automatically + attached and the resulting BPF links stored for further use + by the user in pre-allocated fields in the skeleton struct. + For BPF programs that can't be automatically attached by + libbpf, the user can attach them manually and store the + resulting BPF link in the per-program link field. All links + set up this way will be automatically destroyed on BPF + skeleton destruction. This eliminates the need for users to + manage links manually and lets them rely on libbpf to detach + programs and free up resources.
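+
+		For illustration, a minimal sketch of that manual-attach
+		pattern, assuming a hypothetical object **example** with a
+		program **my_prog** that libbpf cannot auto-attach, and an
+		already opened perf event FD **pe_fd**::
+
+			struct example *skel = example__open_and_load();
+
+			if (!skel)
+				return 1;
+
+			/* attach by hand, but let the skeleton own the link */
+			skel->links.my_prog =
+				bpf_program__attach_perf_event(skel->progs.my_prog,
+							       pe_fd);
+
+			/* destroys the manually stored link as well */
+			example__destroy(skel);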
+ + Another facility provided by the BPF skeleton is an + interface to global variables of all supported kinds: + mutable, read-only, as well as extern ones. This interface + allows presetting initial values of variables before the BPF + object is loaded and verified by the kernel. For + non-read-only variables, the same interface can be used to + fetch values of global variables on the userspace side, even + as they are modified by BPF code. + + During skeleton generation, the contents of the source BPF + object *FILE* are embedded within the generated code, so the + object file does not need to be kept around. This ensures the + skeleton and the BPF object file match 1-to-1 and always stay + in sync. Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. + + It is a design goal and guarantee that skeleton interfaces + are interoperable with generic libbpf APIs. The user should + always be able to use the skeleton API to create and load a + BPF object, and later use libbpf APIs to keep working with + specific maps, programs, etc. + + As part of the skeleton, a few custom functions are + generated. Each of them is prefixed with the object name, + derived from the object file name. For example, if the BPF + object file name is **example.o**, the BPF object name will + be **example**. The following custom functions are provided + in that case: + + - **example__open** and **example__open_opts**. + These functions are used to instantiate the skeleton. They + correspond to libbpf's **bpf_object__open()** API. The + **_opts** variant accepts extra **bpf_object_open_opts** + options. + + - **example__load**. + This function creates maps, loads and verifies BPF + programs, and initializes global data maps. It corresponds + to libbpf's **bpf_object__load()** API. + + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used + operation. + + - **example__attach** and **example__detach** + This pair of functions attaches and detaches, respectively, + an already loaded BPF object. Only BPF programs of types + supported by libbpf for auto-attachment will be + auto-attached and their corresponding BPF links + instantiated. For other BPF programs, the user can manually + create a BPF link and assign it to the corresponding field + in the skeleton struct. **example__detach** will detach + both the links created automatically and those populated + manually by the user. + + - **example__destroy** + Detaches and unloads BPF programs, freeing up all the + resources used by the skeleton and the BPF object. + + If the BPF object has global variables, structs with a memory + layout matching the corresponding global data section layout + will be created. Currently supported are the *.data*, *.bss*, + *.rodata*, and *.kconfig* structs/data sections. These data + sections/structs can be used to set up initial values of + variables, if set before **example__load**. Afterwards, if + the target kernel supports memory-mapped BPF arrays, the same + structs can be used to fetch and update (non-read-only) data + from userspace, with the same simplicity as on the BPF side. + + **bpftool gen help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -V, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, + this option has no effect.
+ + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + + -d, --debug + Print all logs available from libbpf, including debug-level + information. + +EXAMPLES +======== +**$ cat example.c** +:: + + #include <stdbool.h> + #include <linux/ptrace.h> + #include <linux/bpf.h> + #include "bpf_helpers.h" + + const volatile int param1 = 42; + bool global_flag = true; + struct { int x; } data = {}; + + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 128); + __type(key, int); + __type(value, long); + } my_map SEC(".maps"); + + SEC("raw_tp/sys_enter") + int handle_sys_enter(struct pt_regs *ctx) + { + static long my_static_var; + if (global_flag) + my_static_var++; + else + data.x += param1; + return 0; + } + + SEC("raw_tp/sys_exit") + int handle_sys_exit(struct pt_regs *ctx) + { + int zero = 0; + bpf_map_lookup_elem(&my_map, &zero); + return 0; + } + +This is an example BPF application with two BPF programs and a mix of BPF maps +and global variables. + +**$ bpftool gen skeleton example.o** +:: + + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + + /* THIS FILE IS AUTOGENERATED! */ + #ifndef __EXAMPLE_SKEL_H__ + #define __EXAMPLE_SKEL_H__ + + #include <stdlib.h> + #include <libbpf.h> + + struct example { + struct bpf_object_skeleton *skeleton; + struct bpf_object *obj; + struct { + struct bpf_map *rodata; + struct bpf_map *data; + struct bpf_map *bss; + struct bpf_map *my_map; + } maps; + struct { + struct bpf_program *handle_sys_enter; + struct bpf_program *handle_sys_exit; + } progs; + struct { + struct bpf_link *handle_sys_enter; + struct bpf_link *handle_sys_exit; + } links; + struct example__bss { + struct { + int x; + } data; + } *bss; + struct example__data { + _Bool global_flag; + long int handle_sys_enter_my_static_var; + } *data; + struct example__rodata { + int param1; + } *rodata; + }; + + static void example__destroy(struct example *obj); + static inline struct example *example__open_opts( + const struct bpf_object_open_opts *opts); + static inline struct example *example__open(); + static inline int example__load(struct example *obj); + static inline struct example *example__open_and_load(); + static inline int example__attach(struct example *obj); + static inline void example__detach(struct example *obj); + + #endif /* __EXAMPLE_SKEL_H__ */ + +**$ cat example_user.c** +:: + + #include "example.skel.h" + + int main() + { + struct example *skel; + int err = 0; + + skel = example__open(); + if (!skel) + goto cleanup; + + skel->rodata->param1 = 128; + + err = example__load(skel); + if (err) + goto cleanup; + + err = example__attach(skel); + if (err) + goto cleanup; + + /* all libbpf APIs are usable */ + printf("my_map name: %s\n", bpf_map__name(skel->maps.my_map)); + printf("sys_enter prog FD: %d\n", + bpf_program__fd(skel->progs.handle_sys_enter)); + + /* detach and re-attach sys_exit program */ + bpf_link__destroy(skel->links.handle_sys_exit); + skel->links.handle_sys_exit = + bpf_program__attach(skel->progs.handle_sys_exit); + + printf("my_static_var: %ld\n", + skel->bss->handle_sys_enter_my_static_var); + + cleanup: + example__destroy(skel); + return err; + } + +**# ./example_user** +:: + + my_map name: my_map + sys_enter prog FD: 8 + my_static_var: 7 + +This is a stripped-out version of the skeleton generated for the above example code.
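+
+For completeness, a sketch of the **_opts** open path for the same example
+object; the **example_custom** name here is an illustrative assumption, not
+something the generated code requires::
+
+	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
+		.object_name = "example_custom",
+	);
+	struct example *skel = example__open_opts(&opts);
+
+	if (!skel)
+		return 1;	/* open failed */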
+ +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-map**\ (8), + **bpftool-prog**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 1c0f7146aab0..cdeae8ae90ba 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -39,9 +39,9 @@ MAP COMMANDS | **bpftool** **map freeze** *MAP* | **bpftool** **map help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } | *DATA* := { [**hex**] *BYTES* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *VALUE* := { *DATA* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } | *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** @@ -55,8 +55,9 @@ DESCRIPTION =========== **bpftool map { show | list }** [*MAP*] Show information about loaded maps. If *MAP* is specified - show information only about given map, otherwise list all - maps currently loaded on the system. + show information only about given maps, otherwise list all + maps currently loaded on the system. In case of **name**, + *MAP* may match several maps which will all be shown. Output will start with map ID followed by map type and zero or more named attributes (depending on kernel version). @@ -66,7 +67,8 @@ DESCRIPTION as *FILE*. **bpftool map dump** *MAP* - Dump all entries in a given *MAP*. + Dump all entries in a given *MAP*. In case of **name**, + *MAP* may match several maps which will all be dumped. **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] Update map entry for a given *KEY*. diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 7a374b3c851d..64ddf8a4c518 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -33,7 +33,7 @@ PROG COMMANDS | **bpftool** **prog help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *TYPE* := { | **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | | **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | @@ -53,8 +53,10 @@ DESCRIPTION =========== **bpftool prog { show | list }** [*PROG*] Show information about loaded programs. If *PROG* is - specified show information only about given program, otherwise - list all programs currently loaded on the system. + specified show information only about given programs, + otherwise list all programs currently loaded on the system. + In case of **tag** or **name**, *PROG* may match several + programs which will all be shown. Output will start with program ID followed by program type and zero or more named attributes (depending on kernel version). @@ -68,11 +70,15 @@ DESCRIPTION performed via the **kernel.bpf_stats_enabled** sysctl knob. 
**bpftool prog dump xlated** *PROG* [{ **file** *FILE* | **opcodes** | **visual** | **linum** }] - Dump eBPF instructions of the program from the kernel. By + Dump eBPF instructions of the programs from the kernel. By default, eBPF will be disassembled and printed to standard output in human-readable format. In this case, **opcodes** controls if raw opcodes should be printed as well. + In case of **tag** or **name**, *PROG* may match several + programs which will all be dumped. However, if **file** or + **visual** is specified, *PROG* must match a single program. + If **file** is specified, the binary image will instead be written to *FILE*. @@ -80,15 +86,17 @@ DESCRIPTION built instead, and eBPF instructions will be presented with CFG in DOT format, on standard output. - If the prog has line_info available, the source line will + If the programs have line_info available, the source line will be displayed by default. If **linum** is specified, the filename, line number and line column will also be displayed on top of the source line. **bpftool prog dump jited** *PROG* [{ **file** *FILE* | **opcodes** | **linum** }] Dump jited image (host machine code) of the program. + If *FILE* is specified image will be written to a file, otherwise it will be disassembled and printed to stdout. + *PROG* must match a single program when **file** is specified. **opcodes** controls if raw opcodes will be printed. diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 6a9c52ef84a9..34239fda69ed 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -81,4 +81,5 @@ SEE ALSO **bpftool-feature**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-btf**\ (8), + **bpftool-gen**\ (8), diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 70493a6da206..754d8395e451 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -59,6 +59,21 @@ _bpftool_get_map_ids_for_type() command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_map_names() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command sed -n 's/.*"name": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +# Takes map type and adds matching map names to the list of suggestions. +_bpftool_get_map_names_for_type() +{ + local type="$1" + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command grep -C2 "$type" | \ + command sed -n 's/.*"name": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + _bpftool_get_prog_ids() { COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ @@ -71,6 +86,12 @@ _bpftool_get_prog_tags() command sed -n 's/.*"tag": "\(.*\)",$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_prog_names() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ + command sed -n 's/.*"name": "\(.*\)",$/\1/p' )" -- "$cur" ) ) +} + _bpftool_get_btf_ids() { COMPREPLY+=( $( compgen -W "$( bpftool -jp btf 2>&1 | \ @@ -180,6 +201,52 @@ _bpftool_map_update_get_id() esac } +_bpftool_map_update_get_name() +{ + local command="$1" + + # Is it the map to update, or a map to insert into the map to update? + # Search for "value" keyword. 
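+	# E.g. in "bpftool map update name some_map key hex 01 value name X",
+	# X names the value: it completes to program names if some_map is a
+	# prog_array, or to map names if it is a map-in-map, not to the name
+	# of the map being updated.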
+ local idx value + for (( idx=7; idx < ${#words[@]}-1; idx++ )); do + if [[ ${words[idx]} == "value" ]]; then + value=1 + break + fi + done + if [[ $value -eq 0 ]]; then + case "$command" in + push) + _bpftool_get_map_names_for_type stack + ;; + enqueue) + _bpftool_get_map_names_for_type queue + ;; + *) + _bpftool_get_map_names + ;; + esac + return 0 + fi + + # Name to complete is for a value. It can be either prog name or map name. This + # depends on the type of the map to update. + local type=$(_bpftool_map_guess_map_type) + case $type in + array_of_maps|hash_of_maps) + _bpftool_get_map_names + return 0 + ;; + prog_array) + _bpftool_get_prog_names + return 0 + ;; + *) + return 0 + ;; + esac +} + _bpftool() { local cur prev words objword @@ -251,7 +318,8 @@ _bpftool() # Completion depends on object and command in use case $object in prog) - # Complete id, only for subcommands that use prog (but no map) ids + # Complete id and name, only for subcommands that use prog (but no + # map) ids/names. case $command in show|list|dump|pin) case $prev in @@ -259,12 +327,16 @@ _bpftool() _bpftool_get_prog_ids return 0 ;; + name) + _bpftool_get_prog_names + return 0 + ;; esac ;; esac - local PROG_TYPE='id pinned tag' - local MAP_TYPE='id pinned' + local PROG_TYPE='id pinned tag name' + local MAP_TYPE='id pinned name' case $command in show|list) [[ $prev != "$command" ]] && return 0 @@ -315,6 +387,9 @@ _bpftool() id) _bpftool_get_prog_ids ;; + name) + _bpftool_get_map_names + ;; pinned) _filedir ;; @@ -335,6 +410,9 @@ _bpftool() id) _bpftool_get_map_ids ;; + name) + _bpftool_get_map_names + ;; pinned) _filedir ;; @@ -399,6 +477,10 @@ _bpftool() _bpftool_get_map_ids return 0 ;; + name) + _bpftool_get_map_names + return 0 + ;; pinned|pinmaps) _filedir return 0 @@ -447,7 +529,7 @@ _bpftool() esac ;; map) - local MAP_TYPE='id pinned' + local MAP_TYPE='id pinned name' case $command in show|list|dump|peek|pop|dequeue|freeze) case $prev in @@ -473,6 +555,24 @@ _bpftool() esac return 0 ;; + name) + case "$command" in + peek) + _bpftool_get_map_names_for_type stack + _bpftool_get_map_names_for_type queue + ;; + pop) + _bpftool_get_map_names_for_type stack + ;; + dequeue) + _bpftool_get_map_names_for_type queue + ;; + *) + _bpftool_get_map_names + ;; + esac + return 0 + ;; *) return 0 ;; @@ -520,6 +620,10 @@ _bpftool() _bpftool_get_map_ids return 0 ;; + name) + _bpftool_get_map_names + return 0 + ;; key) COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) ) ;; @@ -545,6 +649,10 @@ _bpftool() _bpftool_map_update_get_id $command return 0 ;; + name) + _bpftool_map_update_get_name $command + return 0 + ;; key) COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) ) ;; @@ -553,13 +661,13 @@ _bpftool() # map, depending on the type of the map to update. 
case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) - local MAP_TYPE='id pinned' + local MAP_TYPE='id pinned name' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 ;; prog_array) - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ -- "$cur" ) ) return 0 @@ -621,6 +729,10 @@ _bpftool() _bpftool_get_map_ids_for_type perf_event_array return 0 ;; + name) + _bpftool_get_map_names_for_type perf_event_array + return 0 + ;; cpu) return 0 ;; @@ -644,8 +756,8 @@ _bpftool() esac ;; btf) - local PROG_TYPE='id pinned tag' - local MAP_TYPE='id pinned' + local PROG_TYPE='id pinned tag name' + local MAP_TYPE='id pinned name' case $command in dump) case $prev in @@ -676,6 +788,17 @@ _bpftool() esac return 0 ;; + name) + case $pprev in + prog) + _bpftool_get_prog_names + ;; + map) + _bpftool_get_map_names + ;; + esac + return 0 + ;; format) COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) ;; @@ -716,6 +839,17 @@ _bpftool() ;; esac ;; + gen) + case $command in + skeleton) + _filedir + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'skeleton help' -- "$cur" ) ) + ;; + esac + ;; cgroup) case $command in show|list|tree) @@ -735,7 +869,7 @@ _bpftool() connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \ getsockopt setsockopt' local ATTACH_FLAGS='multi override' - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' case $prev in $command) _filedir @@ -760,7 +894,7 @@ _bpftool() elif [[ "$command" == "attach" ]]; then # We have an attach type on the command line, # but it is not the previous word, or - # "id|pinned|tag" (we already checked for + # "id|pinned|tag|name" (we already checked for # that). This should only leave the case when # we need attach flags for "attach" commamnd. _bpftool_one_of_list "$ATTACH_FLAGS" @@ -786,7 +920,7 @@ _bpftool() esac ;; net) - local PROG_TYPE='id pinned tag' + local PROG_TYPE='id pinned tag name' local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' case $command in show|list) diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 1ef45e55039e..2f017caa678d 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -117,6 +117,25 @@ static int count_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type) return prog_cnt; } +static int cgroup_has_attached_progs(int cgroup_fd) +{ + enum bpf_attach_type type; + bool no_prog = true; + + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + int count = count_attached_bpf_progs(cgroup_fd, type); + + if (count < 0 && errno != EINVAL) + return -1; + + if (count > 0) { + no_prog = false; + break; + } + } + + return no_prog ? 
0 : 1; +} static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, int level) { @@ -161,6 +180,7 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, static int do_show(int argc, char **argv) { enum bpf_attach_type type; + int has_attached_progs; const char *path; int cgroup_fd; int ret = -1; @@ -192,6 +212,16 @@ static int do_show(int argc, char **argv) goto exit; } + has_attached_progs = cgroup_has_attached_progs(cgroup_fd); + if (has_attached_progs < 0) { + p_err("can't query bpf programs attached to %s: %s", + path, strerror(errno)); + goto exit_cgroup; + } else if (!has_attached_progs) { + ret = 0; + goto exit_cgroup; + } + if (json_output) jsonw_start_array(json_wtr); else @@ -212,6 +242,7 @@ static int do_show(int argc, char **argv) if (json_output) jsonw_end_array(json_wtr); +exit_cgroup: close(cgroup_fd); exit: return ret; @@ -228,7 +259,7 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftw) { enum bpf_attach_type type; - bool skip = true; + int has_attached_progs; int cgroup_fd; if (typeflag != FTW_D) @@ -240,22 +271,13 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb, return SHOW_TREE_FN_ERR; } - for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { - int count = count_attached_bpf_progs(cgroup_fd, type); - - if (count < 0 && errno != EINVAL) { - p_err("can't query bpf programs attached to %s: %s", - fpath, strerror(errno)); - close(cgroup_fd); - return SHOW_TREE_FN_ERR; - } - if (count > 0) { - skip = false; - break; - } - } - - if (skip) { + has_attached_progs = cgroup_has_attached_progs(cgroup_fd); + if (has_attached_progs < 0) { + p_err("can't query bpf programs attached to %s: %s", + fpath, strerror(errno)); + close(cgroup_fd); + return SHOW_TREE_FN_ERR; + } else if (!has_attached_progs) { close(cgroup_fd); return 0; } diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c new file mode 100644 index 000000000000..7ce09a9a6999 --- /dev/null +++ b/tools/bpf/bpftool/gen.c @@ -0,0 +1,609 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Facebook */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/err.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <bpf.h> +#include <libbpf.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "btf.h" +#include "libbpf_internal.h" +#include "json_writer.h" +#include "main.h" + + +#define MAX_OBJ_NAME_LEN 64 + +static void sanitize_identifier(char *name) +{ + int i; + + for (i = 0; name[i]; i++) + if (!isalnum(name[i]) && name[i] != '_') + name[i] = '_'; +} + +static bool str_has_suffix(const char *str, const char *suffix) +{ + size_t i, n1 = strlen(str), n2 = strlen(suffix); + + if (n1 < n2) + return false; + + for (i = 0; i < n2; i++) { + if (str[n1 - i - 1] != suffix[n2 - i - 1]) + return false; + } + + return true; +} + +static void get_obj_name(char *name, const char *file) +{ + /* Using basename() GNU version which doesn't modify arg. 
*/ + strncpy(name, basename(file), MAX_OBJ_NAME_LEN - 1); + name[MAX_OBJ_NAME_LEN - 1] = '\0'; + if (str_has_suffix(name, ".o")) + name[strlen(name) - 2] = '\0'; + sanitize_identifier(name); +} + +static void get_header_guard(char *guard, const char *obj_name) +{ + int i; + + sprintf(guard, "__%s_SKEL_H__", obj_name); + for (i = 0; guard[i]; i++) + guard[i] = toupper(guard[i]); +} + +static const char *get_map_ident(const struct bpf_map *map) +{ + const char *name = bpf_map__name(map); + + if (!bpf_map__is_internal(map)) + return name; + + if (str_has_suffix(name, ".data")) + return "data"; + else if (str_has_suffix(name, ".rodata")) + return "rodata"; + else if (str_has_suffix(name, ".bss")) + return "bss"; + else if (str_has_suffix(name, ".kconfig")) + return "kconfig"; + else + return NULL; +} + +static void codegen_btf_dump_printf(void *ct, const char *fmt, va_list args) +{ + vprintf(fmt, args); +} + +static int codegen_datasec_def(struct bpf_object *obj, + struct btf *btf, + struct btf_dump *d, + const struct btf_type *sec, + const char *obj_name) +{ + const char *sec_name = btf__name_by_offset(btf, sec->name_off); + const struct btf_var_secinfo *sec_var = btf_var_secinfos(sec); + int i, err, off = 0, pad_cnt = 0, vlen = btf_vlen(sec); + const char *sec_ident; + char var_ident[256]; + + if (strcmp(sec_name, ".data") == 0) + sec_ident = "data"; + else if (strcmp(sec_name, ".bss") == 0) + sec_ident = "bss"; + else if (strcmp(sec_name, ".rodata") == 0) + sec_ident = "rodata"; + else if (strcmp(sec_name, ".kconfig") == 0) + sec_ident = "kconfig"; + else + return 0; + + printf(" struct %s__%s {\n", obj_name, sec_ident); + for (i = 0; i < vlen; i++, sec_var++) { + const struct btf_type *var = btf__type_by_id(btf, sec_var->type); + const char *var_name = btf__name_by_offset(btf, var->name_off); + DECLARE_LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts, + .field_name = var_ident, + .indent_level = 2, + ); + int need_off = sec_var->offset, align_off, align; + __u32 var_type_id = var->type; + const struct btf_type *t; + + t = btf__type_by_id(btf, var_type_id); + while (btf_is_mod(t)) { + var_type_id = t->type; + t = btf__type_by_id(btf, var_type_id); + } + + if (off > need_off) { + p_err("Something is wrong for %s's variable #%d: need offset %d, already at %d.\n", + sec_name, i, need_off, off); + return -EINVAL; + } + + align = btf__align_of(btf, var->type); + if (align <= 0) { + p_err("Failed to determine alignment of variable '%s': %d", + var_name, align); + return -EINVAL; + } + + align_off = (off + align - 1) / align * align; + if (align_off != need_off) { + printf("\t\tchar __pad%d[%d];\n", + pad_cnt, need_off - off); + pad_cnt++; + } + + /* sanitize variable name, e.g., for static vars inside + * a function, it's name is '<function name>.<variable name>', + * which we'll turn into a '<function name>_<variable name>' + */ + var_ident[0] = '\0'; + strncat(var_ident, var_name, sizeof(var_ident) - 1); + sanitize_identifier(var_ident); + + printf("\t\t"); + err = btf_dump__emit_type_decl(d, var_type_id, &opts); + if (err) + return err; + printf(";\n"); + + off = sec_var->offset + sec_var->size; + } + printf(" } *%s;\n", sec_ident); + return 0; +} + +static int codegen_datasecs(struct bpf_object *obj, const char *obj_name) +{ + struct btf *btf = bpf_object__btf(obj); + int n = btf__get_nr_types(btf); + struct btf_dump *d; + int i, err = 0; + + d = btf_dump__new(btf, NULL, NULL, codegen_btf_dump_printf); + if (IS_ERR(d)) + return PTR_ERR(d); + + for (i = 1; i <= n; i++) { + const struct btf_type 
*t = btf__type_by_id(btf, i); + + if (!btf_is_datasec(t)) + continue; + + err = codegen_datasec_def(obj, btf, d, t, obj_name); + if (err) + goto out; + } +out: + btf_dump__free(d); + return err; +} + +static int codegen(const char *template, ...) +{ + const char *src, *end; + int skip_tabs = 0, n; + char *s, *dst; + va_list args; + char c; + + n = strlen(template); + s = malloc(n + 1); + if (!s) + return -ENOMEM; + src = template; + dst = s; + + /* find out "baseline" indentation to skip */ + while ((c = *src++)) { + if (c == '\t') { + skip_tabs++; + } else if (c == '\n') { + break; + } else { + p_err("unrecognized character at pos %td in template '%s'", + src - template - 1, template); + return -EINVAL; + } + } + + while (*src) { + /* skip baseline indentation tabs */ + for (n = skip_tabs; n > 0; n--, src++) { + if (*src != '\t') { + p_err("not enough tabs at pos %td in template '%s'", + src - template - 1, template); + return -EINVAL; + } + } + /* trim trailing whitespace */ + end = strchrnul(src, '\n'); + for (n = end - src; n > 0 && isspace(src[n - 1]); n--) + ; + memcpy(dst, src, n); + dst += n; + if (*end) + *dst++ = '\n'; + src = *end ? end + 1 : end; + } + *dst++ = '\0'; + + /* print out using adjusted template */ + va_start(args, template); + n = vprintf(s, args); + va_end(args); + + free(s); + return n; +} + +static int do_skeleton(int argc, char **argv) +{ + char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; + size_t i, map_cnt = 0, prog_cnt = 0, file_sz, mmap_sz; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + char obj_name[MAX_OBJ_NAME_LEN], *obj_data; + struct bpf_object *obj = NULL; + const char *file, *ident; + struct bpf_program *prog; + int fd, len, err = -1; + struct bpf_map *map; + struct btf *btf; + struct stat st; + + if (!REQ_ARGS(1)) { + usage(); + return -1; + } + file = GET_ARG(); + + if (argc) { + p_err("extra unknown arguments"); + return -1; + } + + if (stat(file, &st)) { + p_err("failed to stat() %s: %s", file, strerror(errno)); + return -1; + } + file_sz = st.st_size; + mmap_sz = roundup(file_sz, sysconf(_SC_PAGE_SIZE)); + fd = open(file, O_RDONLY); + if (fd < 0) { + p_err("failed to open() %s: %s", file, strerror(errno)); + return -1; + } + obj_data = mmap(NULL, mmap_sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (obj_data == MAP_FAILED) { + obj_data = NULL; + p_err("failed to mmap() %s: %s", file, strerror(errno)); + goto out; + } + get_obj_name(obj_name, file); + opts.object_name = obj_name; + obj = bpf_object__open_mem(obj_data, file_sz, &opts); + if (IS_ERR(obj)) { + obj = NULL; + p_err("failed to open BPF object file: %ld", PTR_ERR(obj)); + goto out; + } + + bpf_object__for_each_map(map, obj) { + ident = get_map_ident(map); + if (!ident) { + p_err("ignoring unrecognized internal map '%s'...", + bpf_map__name(map)); + continue; + } + map_cnt++; + } + bpf_object__for_each_program(prog, obj) { + prog_cnt++; + } + + get_header_guard(header_guard, obj_name); + codegen("\ + \n\ + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ \n\ + \n\ + /* THIS FILE IS AUTOGENERATED! 
*/ \n\ + #ifndef %2$s \n\ + #define %2$s \n\ + \n\ + #include <stdlib.h> \n\ + #include <libbpf.h> \n\ + \n\ + struct %1$s { \n\ + struct bpf_object_skeleton *skeleton; \n\ + struct bpf_object *obj; \n\ + ", + obj_name, header_guard + ); + + if (map_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_map(map, obj) { + ident = get_map_ident(map); + if (!ident) + continue; + printf("\t\tstruct bpf_map *%s;\n", ident); + } + printf("\t} maps;\n"); + } + + if (prog_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_program(prog, obj) { + printf("\t\tstruct bpf_program *%s;\n", + bpf_program__name(prog)); + } + printf("\t} progs;\n"); + printf("\tstruct {\n"); + bpf_object__for_each_program(prog, obj) { + printf("\t\tstruct bpf_link *%s;\n", + bpf_program__name(prog)); + } + printf("\t} links;\n"); + } + + btf = bpf_object__btf(obj); + if (btf) { + err = codegen_datasecs(obj, obj_name); + if (err) + goto out; + } + + codegen("\ + \n\ + }; \n\ + \n\ + static void \n\ + %1$s__destroy(struct %1$s *obj) \n\ + { \n\ + if (!obj) \n\ + return; \n\ + if (obj->skeleton) \n\ + bpf_object__destroy_skeleton(obj->skeleton);\n\ + free(obj); \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__create_skeleton(struct %1$s *obj); \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_opts(const struct bpf_object_open_opts *opts) \n\ + { \n\ + struct %1$s *obj; \n\ + \n\ + obj = (typeof(obj))calloc(1, sizeof(*obj)); \n\ + if (!obj) \n\ + return NULL; \n\ + if (%1$s__create_skeleton(obj)) \n\ + goto err; \n\ + if (bpf_object__open_skeleton(obj->skeleton, opts)) \n\ + goto err; \n\ + \n\ + return obj; \n\ + err: \n\ + %1$s__destroy(obj); \n\ + return NULL; \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open(void) \n\ + { \n\ + return %1$s__open_opts(NULL); \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__load(struct %1$s *obj) \n\ + { \n\ + return bpf_object__load_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_and_load(void) \n\ + { \n\ + struct %1$s *obj; \n\ + \n\ + obj = %1$s__open(); \n\ + if (!obj) \n\ + return NULL; \n\ + if (%1$s__load(obj)) { \n\ + %1$s__destroy(obj); \n\ + return NULL; \n\ + } \n\ + return obj; \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__attach(struct %1$s *obj) \n\ + { \n\ + return bpf_object__attach_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline void \n\ + %1$s__detach(struct %1$s *obj) \n\ + { \n\ + return bpf_object__detach_skeleton(obj->skeleton); \n\ + } \n\ + ", + obj_name + ); + + codegen("\ + \n\ + \n\ + static inline int \n\ + %1$s__create_skeleton(struct %1$s *obj) \n\ + { \n\ + struct bpf_object_skeleton *s; \n\ + \n\ + s = (typeof(s))calloc(1, sizeof(*s)); \n\ + if (!s) \n\ + return -1; \n\ + obj->skeleton = s; \n\ + \n\ + s->sz = sizeof(*s); \n\ + s->name = \"%1$s\"; \n\ + s->obj = &obj->obj; \n\ + ", + obj_name + ); + if (map_cnt) { + codegen("\ + \n\ + \n\ + /* maps */ \n\ + s->map_cnt = %zu; \n\ + s->map_skel_sz = sizeof(*s->maps); \n\ + s->maps = (typeof(s->maps))calloc(s->map_cnt, s->map_skel_sz);\n\ + if (!s->maps) \n\ + goto err; \n\ + ", + map_cnt + ); + i = 0; + bpf_object__for_each_map(map, obj) { + ident = get_map_ident(map); + + if (!ident) + continue; + + codegen("\ + \n\ + \n\ + s->maps[%zu].name = \"%s\"; \n\ + s->maps[%zu].map = &obj->maps.%s; \n\ + ", + i, bpf_map__name(map), i, ident); + /* memory-mapped internal maps */ + if (bpf_map__is_internal(map) && + (bpf_map__def(map)->map_flags & BPF_F_MMAPABLE)) { + printf("\ts->maps[%zu].mmaped = (void **)&obj->%s;\n", + i, ident); 
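+			/*
+			 * For BPF_F_MMAPABLE maps, bpf_object__load_skeleton()
+			 * mmap()s the map and stores the address through this
+			 * mmaped pointer, so obj->data/obj->bss/obj->rodata
+			 * point directly at the kernel-backed memory after
+			 * load.
+			 */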
+ } + i++; + } + } + if (prog_cnt) { + codegen("\ + \n\ + \n\ + /* programs */ \n\ + s->prog_cnt = %zu; \n\ + s->prog_skel_sz = sizeof(*s->progs); \n\ + s->progs = (typeof(s->progs))calloc(s->prog_cnt, s->prog_skel_sz);\n\ + if (!s->progs) \n\ + goto err; \n\ + ", + prog_cnt + ); + i = 0; + bpf_object__for_each_program(prog, obj) { + codegen("\ + \n\ + \n\ + s->progs[%1$zu].name = \"%2$s\"; \n\ + s->progs[%1$zu].prog = &obj->progs.%2$s;\n\ + s->progs[%1$zu].link = &obj->links.%2$s;\n\ + ", + i, bpf_program__name(prog)); + i++; + } + } + codegen("\ + \n\ + \n\ + s->data_sz = %d; \n\ + s->data = (void *)\"\\ \n\ + ", + file_sz); + + /* embed contents of BPF object file */ + for (i = 0, len = 0; i < file_sz; i++) { + int w = obj_data[i] ? 4 : 2; + + len += w; + if (len > 78) { + printf("\\\n"); + len = w; + } + if (!obj_data[i]) + printf("\\0"); + else + printf("\\x%02x", (unsigned char)obj_data[i]); + } + + codegen("\ + \n\ + \"; \n\ + \n\ + return 0; \n\ + err: \n\ + bpf_object__destroy_skeleton(s); \n\ + return -1; \n\ + } \n\ + \n\ + #endif /* %s */ \n\ + ", + header_guard); + err = 0; +out: + bpf_object__close(obj); + if (obj_data) + munmap(obj_data, mmap_sz); + close(fd); + return err; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s gen skeleton FILE\n" + " %1$s gen help\n" + "\n" + " " HELP_SPEC_OPTIONS "\n" + "", + bin_name); + + return 0; +} + +static const struct cmd cmds[] = { + { "skeleton", do_skeleton }, + { "help", do_help }, + { 0 } +}; + +int do_gen(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 4764581ff9ea..1fe91c558508 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -58,7 +58,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf | net | feature | btf }\n" + " OBJECT := { prog | map | cgroup | perf | net | feature | btf | gen }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -227,6 +227,7 @@ static const struct cmd cmds[] = { { "net", do_net }, { "feature", do_feature }, { "btf", do_btf }, + { "gen", do_gen }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 2899095f8254..4e75b58d3989 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -42,12 +42,12 @@ #define BPF_TAG_FMT "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" #define HELP_SPEC_PROGRAM \ - "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }" + "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG | name PROG_NAME }" #define HELP_SPEC_OPTIONS \ "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} |\n" \ "\t {-m|--mapcompat} | {-n|--nomount} }" #define HELP_SPEC_MAP \ - "MAP := { id MAP_ID | pinned FILE }" + "MAP := { id MAP_ID | pinned FILE | name MAP_NAME }" static const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", @@ -155,6 +155,7 @@ int do_net(int argc, char **arg); int do_tracelog(int argc, char **arg); int do_feature(int argc, char **argv); int do_btf(int argc, char **argv); +int do_gen(int argc, char **argv); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index de61d73b9030..c01f76fa6876 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c 
@@ -91,10 +91,66 @@ static void *alloc_value(struct bpf_map_info *info) return malloc(info->value_size); } -int map_parse_fd(int *argc, char ***argv) +static int map_fd_by_name(char *name, int **fds) { - int fd; + unsigned int id = 0; + int fd, nb_fds = 0; + void *tmp; + int err; + + while (true) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno != ENOENT) { + p_err("%s", strerror(errno)); + goto err_close_fds; + } + return nb_fds; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + p_err("can't get map by id (%u): %s", + id, strerror(errno)); + goto err_close_fds; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get map info (%u): %s", + id, strerror(errno)); + goto err_close_fd; + } + + if (strncmp(name, info.name, BPF_OBJ_NAME_LEN)) { + close(fd); + continue; + } + + if (nb_fds > 0) { + tmp = realloc(*fds, (nb_fds + 1) * sizeof(int)); + if (!tmp) { + p_err("failed to realloc"); + goto err_close_fd; + } + *fds = tmp; + } + (*fds)[nb_fds++] = fd; + } + +err_close_fd: + close(fd); +err_close_fds: + while (--nb_fds >= 0) + close((*fds)[nb_fds]); + return -1; +} +static int map_parse_fds(int *argc, char ***argv, int **fds) +{ if (is_prefix(**argv, "id")) { unsigned int id; char *endptr; @@ -108,10 +164,25 @@ int map_parse_fd(int *argc, char ***argv) } NEXT_ARGP(); - fd = bpf_map_get_fd_by_id(id); - if (fd < 0) + (*fds)[0] = bpf_map_get_fd_by_id(id); + if ((*fds)[0] < 0) { p_err("get map by id (%u): %s", id, strerror(errno)); - return fd; + return -1; + } + return 1; + } else if (is_prefix(**argv, "name")) { + char *name; + + NEXT_ARGP(); + + name = **argv; + if (strlen(name) > BPF_OBJ_NAME_LEN - 1) { + p_err("can't parse name"); + return -1; + } + NEXT_ARGP(); + + return map_fd_by_name(name, fds); } else if (is_prefix(**argv, "pinned")) { char *path; @@ -120,13 +191,43 @@ int map_parse_fd(int *argc, char ***argv) path = **argv; NEXT_ARGP(); - return open_obj_pinned_any(path, BPF_OBJ_MAP); + (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_MAP); + if ((*fds)[0] < 0) + return -1; + return 1; } - p_err("expected 'id' or 'pinned', got: '%s'?", **argv); + p_err("expected 'id', 'name' or 'pinned', got: '%s'?", **argv); return -1; } +int map_parse_fd(int *argc, char ***argv) +{ + int *fds = NULL; + int nb_fds, fd; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(argc, argv, &fds); + if (nb_fds != 1) { + if (nb_fds > 1) { + p_err("several maps match this handle"); + while (nb_fds--) + close(fds[nb_fds]); + } + fd = -1; + goto exit_free; + } + + fd = fds[0]; +exit_free: + free(fds); + return fd; +} + int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len) { int err; @@ -479,6 +580,21 @@ static int parse_elem(char **argv, struct bpf_map_info *info, return -1; } +static void show_map_header_json(struct bpf_map_info *info, json_writer_t *wtr) +{ + jsonw_uint_field(wtr, "id", info->id); + if (info->type < ARRAY_SIZE(map_type_name)) + jsonw_string_field(wtr, "type", map_type_name[info->type]); + else + jsonw_uint_field(wtr, "type", info->type); + + if (*info->name) + jsonw_string_field(wtr, "name", info->name); + + jsonw_name(wtr, "flags"); + jsonw_printf(wtr, "%d", info->map_flags); +} + static int show_map_close_json(int fd, struct bpf_map_info *info) { char *memlock, *frozen_str; @@ -489,18 +605,7 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) jsonw_start_object(json_wtr); - 
jsonw_uint_field(json_wtr, "id", info->id); - if (info->type < ARRAY_SIZE(map_type_name)) - jsonw_string_field(json_wtr, "type", - map_type_name[info->type]); - else - jsonw_uint_field(json_wtr, "type", info->type); - - if (*info->name) - jsonw_string_field(json_wtr, "name", info->name); - - jsonw_name(json_wtr, "flags"); - jsonw_printf(json_wtr, "%d", info->map_flags); + show_map_header_json(info, json_wtr); print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); @@ -561,14 +666,8 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) return 0; } -static int show_map_close_plain(int fd, struct bpf_map_info *info) +static void show_map_header_plain(struct bpf_map_info *info) { - char *memlock, *frozen_str; - int frozen = 0; - - memlock = get_fdinfo(fd, "memlock"); - frozen_str = get_fdinfo(fd, "frozen"); - printf("%u: ", info->id); if (info->type < ARRAY_SIZE(map_type_name)) printf("%s ", map_type_name[info->type]); @@ -581,6 +680,17 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info) printf("flags 0x%x", info->map_flags); print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); printf("\n"); +} + +static int show_map_close_plain(int fd, struct bpf_map_info *info) +{ + char *memlock, *frozen_str; + int frozen = 0; + + memlock = get_fdinfo(fd, "memlock"); + frozen_str = get_fdinfo(fd, "frozen"); + + show_map_header_plain(info); printf("\tkey %uB value %uB max_entries %u", info->key_size, info->value_size, info->max_entries); @@ -642,6 +752,50 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info) return 0; } +static int do_show_subset(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int *fds = NULL; + int nb_fds, i; + int err = -1; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + err = bpf_obj_get_info_by_fd(fds[i], &info, &len); + if (err) { + p_err("can't get map info: %s", + strerror(errno)); + for (; i < nb_fds; i++) + close(fds[i]); + break; + } + + if (json_output) + show_map_close_json(fds[i], &info); + else + show_map_close_plain(fds[i], &info); + + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_free: + free(fds); + return err; +} + static int do_show(int argc, char **argv) { struct bpf_map_info info = {}; @@ -653,16 +807,8 @@ static int do_show(int argc, char **argv) if (show_pinned) build_pinned_obj_table(&map_table, BPF_OBJ_MAP); - if (argc == 2) { - fd = map_parse_fd_and_info(&argc, &argv, &info, &len); - if (fd < 0) - return -1; - - if (json_output) - return show_map_close_json(fd, &info); - else - return show_map_close_plain(fd, &info); - } + if (argc == 2) + return do_show_subset(argc, argv); if (argc) return BAD_ARG(); @@ -765,26 +911,49 @@ static int dump_map_elem(int fd, void *key, void *value, return 0; } -static int do_dump(int argc, char **argv) +static int maps_have_btf(int *fds, int nb_fds) { struct bpf_map_info info = {}; - void *key, *value, *prev_key; - unsigned int num_elems = 0; __u32 len = sizeof(info); - json_writer_t *btf_wtr; struct btf *btf = NULL; - int err; - int fd; + int err, i; - if (argc != 2) - usage(); + for (i = 0; i < nb_fds; i++) { + err = bpf_obj_get_info_by_fd(fds[i], &info, &len); + if (err) { + p_err("can't get map info: %s", strerror(errno)); + 
goto err_close; + } - fd = map_parse_fd_and_info(&argc, &argv, &info, &len); - if (fd < 0) - return -1; + err = btf__get_from_id(info.btf_id, &btf); + if (err) { + p_err("failed to get btf"); + goto err_close; + } - key = malloc(info.key_size); - value = alloc_value(&info); + if (!btf) + return 0; + } + + return 1; + +err_close: + for (; i < nb_fds; i++) + close(fds[i]); + return -1; +} + +static int +map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, + bool enable_btf, bool show_header) +{ + void *key, *value, *prev_key; + unsigned int num_elems = 0; + struct btf *btf = NULL; + int err; + + key = malloc(info->key_size); + value = alloc_value(info); if (!key || !value) { p_err("mem alloc failed"); err = -1; @@ -793,30 +962,32 @@ static int do_dump(int argc, char **argv) prev_key = NULL; - err = btf__get_from_id(info.btf_id, &btf); - if (err) { - p_err("failed to get btf"); - goto exit_free; + if (enable_btf) { + err = btf__get_from_id(info->btf_id, &btf); + if (err || !btf) { + /* enable_btf is true only if we've already checked + * that all maps have BTF information. + */ + p_err("failed to get btf"); + goto exit_free; + } } - if (json_output) - jsonw_start_array(json_wtr); - else - if (btf) { - btf_wtr = get_btf_writer(); - if (!btf_wtr) { - p_info("failed to create json writer for btf. falling back to plain output"); - btf__free(btf); - btf = NULL; - } else { - jsonw_start_array(btf_wtr); - } + if (wtr) { + if (show_header) { + jsonw_start_object(wtr); /* map object */ + show_map_header_json(info, wtr); + jsonw_name(wtr, "elements"); } + jsonw_start_array(wtr); /* elements */ + } else if (show_header) { + show_map_header_plain(info); + } - if (info.type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && - info.value_size != 8) + if (info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && + info->value_size != 8) p_info("Warning: cannot read values from %s map with value_size != 8", - map_type_name[info.type]); + map_type_name[info->type]); while (true) { err = bpf_map_get_next_key(fd, prev_key, key); if (err) { @@ -824,15 +995,14 @@ static int do_dump(int argc, char **argv) err = 0; break; } - num_elems += dump_map_elem(fd, key, value, &info, btf, btf_wtr); + num_elems += dump_map_elem(fd, key, value, info, btf, wtr); prev_key = key; } - if (json_output) - jsonw_end_array(json_wtr); - else if (btf) { - jsonw_end_array(btf_wtr); - jsonw_destroy(&btf_wtr); + if (wtr) { + jsonw_end_array(wtr); /* elements */ + if (show_header) + jsonw_end_object(wtr); /* map object */ } else { printf("Found %u element%s\n", num_elems, num_elems != 1 ? "s" : ""); @@ -847,6 +1017,72 @@ exit_free: return err; } +static int do_dump(int argc, char **argv) +{ + json_writer_t *wtr = NULL, *btf_wtr = NULL; + struct bpf_map_info info = {}; + int nb_fds, i = 0, btf = 0; + __u32 len = sizeof(info); + int *fds = NULL; + int err = -1; + + if (argc != 2) + usage(); + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output) { + wtr = json_wtr; + } else { + btf = maps_have_btf(fds, nb_fds); + if (btf < 0) + goto exit_close; + if (btf) { + btf_wtr = get_btf_writer(); + if (btf_wtr) { + wtr = btf_wtr; + } else { + p_info("failed to create json writer for btf. 
falling back to plain output"); + btf = 0; + } + } + } + + if (wtr && nb_fds > 1) + jsonw_start_array(wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + if (bpf_obj_get_info_by_fd(fds[i], &info, &len)) { + p_err("can't get map info: %s", strerror(errno)); + break; + } + err = map_dump(fds[i], &info, wtr, btf, nb_fds > 1); + if (!wtr && i != nb_fds - 1) + printf("\n"); + + if (err) + break; + close(fds[i]); + } + if (wtr && nb_fds > 1) + jsonw_end_array(wtr); /* root array */ + + if (btf) + jsonw_destroy(&btf_wtr); +exit_close: + for (; i < nb_fds; i++) + close(fds[i]); +exit_free: + free(fds); + return err; +} + static int alloc_key_value(struct bpf_map_info *info, void **key, void **value) { *key = NULL; diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 4f52d3151616..d93bee298e54 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -18,6 +18,7 @@ #include <bpf.h> #include <nlattr.h> +#include "libbpf_internal.h" #include "main.h" #include "netlink_dumper.h" diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 2ce9c5ba1934..47a61ac42dc0 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -25,6 +25,11 @@ #include "main.h" #include "xlated_dumper.h" +enum dump_mode { + DUMP_JITED, + DUMP_XLATED, +}; + static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", @@ -77,11 +82,12 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) strftime(buf, size, "%FT%T%z", &load_tm); } -static int prog_fd_by_tag(unsigned char *tag) +static int prog_fd_by_nametag(void *nametag, int **fds, bool tag) { unsigned int id = 0; + int fd, nb_fds = 0; + void *tmp; int err; - int fd; while (true) { struct bpf_prog_info info = {}; @@ -89,36 +95,54 @@ static int prog_fd_by_tag(unsigned char *tag) err = bpf_prog_get_next_id(id, &id); if (err) { - p_err("%s", strerror(errno)); - return -1; + if (errno != ENOENT) { + p_err("%s", strerror(errno)); + goto err_close_fds; + } + return nb_fds; } fd = bpf_prog_get_fd_by_id(id); if (fd < 0) { p_err("can't get prog by id (%u): %s", id, strerror(errno)); - return -1; + goto err_close_fds; } err = bpf_obj_get_info_by_fd(fd, &info, &len); if (err) { p_err("can't get prog info (%u): %s", id, strerror(errno)); - close(fd); - return -1; + goto err_close_fd; } - if (!memcmp(tag, info.tag, BPF_TAG_SIZE)) - return fd; + if ((tag && memcmp(nametag, info.tag, BPF_TAG_SIZE)) || + (!tag && strncmp(nametag, info.name, BPF_OBJ_NAME_LEN))) { + close(fd); + continue; + } - close(fd); + if (nb_fds > 0) { + tmp = realloc(*fds, (nb_fds + 1) * sizeof(int)); + if (!tmp) { + p_err("failed to realloc"); + goto err_close_fd; + } + *fds = tmp; + } + (*fds)[nb_fds++] = fd; } + +err_close_fd: + close(fd); +err_close_fds: + while (--nb_fds >= 0) + close((*fds)[nb_fds]); + return -1; } -int prog_parse_fd(int *argc, char ***argv) +static int prog_parse_fds(int *argc, char ***argv, int **fds) { - int fd; - if (is_prefix(**argv, "id")) { unsigned int id; char *endptr; @@ -132,10 +156,12 @@ int prog_parse_fd(int *argc, char ***argv) } NEXT_ARGP(); - fd = bpf_prog_get_fd_by_id(id); - if (fd < 0) + (*fds)[0] = bpf_prog_get_fd_by_id(id); + if ((*fds)[0] < 0) { p_err("get by id (%u): %s", id, strerror(errno)); - return fd; + return -1; + } + return 1; } else if (is_prefix(**argv, "tag")) { unsigned char tag[BPF_TAG_SIZE]; @@ -149,7 +175,20 @@ int prog_parse_fd(int *argc, char ***argv) } NEXT_ARGP(); - return prog_fd_by_tag(tag); + 
return prog_fd_by_nametag(tag, fds, true); + } else if (is_prefix(**argv, "name")) { + char *name; + + NEXT_ARGP(); + + name = **argv; + if (strlen(name) > BPF_OBJ_NAME_LEN - 1) { + p_err("can't parse name"); + return -1; + } + NEXT_ARGP(); + + return prog_fd_by_nametag(name, fds, false); } else if (is_prefix(**argv, "pinned")) { char *path; @@ -158,13 +197,43 @@ int prog_parse_fd(int *argc, char ***argv) path = **argv; NEXT_ARGP(); - return open_obj_pinned_any(path, BPF_OBJ_PROG); + (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG); + if ((*fds)[0] < 0) + return -1; + return 1; } - p_err("expected 'id', 'tag' or 'pinned', got: '%s'?", **argv); + p_err("expected 'id', 'tag', 'name' or 'pinned', got: '%s'?", **argv); return -1; } +int prog_parse_fd(int *argc, char ***argv) +{ + int *fds = NULL; + int nb_fds, fd; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(argc, argv, &fds); + if (nb_fds != 1) { + if (nb_fds > 1) { + p_err("several programs match this handle"); + while (nb_fds--) + close(fds[nb_fds]); + } + fd = -1; + goto exit_free; + } + + fd = fds[0]; +exit_free: + free(fds); + return fd; +} + static void show_prog_maps(int fd, u32 num_maps) { struct bpf_prog_info info = {}; @@ -194,11 +263,8 @@ static void show_prog_maps(int fd, u32 num_maps) } } -static void print_prog_json(struct bpf_prog_info *info, int fd) +static void print_prog_header_json(struct bpf_prog_info *info) { - char *memlock; - - jsonw_start_object(json_wtr); jsonw_uint_field(json_wtr, "id", info->id); if (info->type < ARRAY_SIZE(prog_type_name)) jsonw_string_field(json_wtr, "type", @@ -219,7 +285,14 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); } +} +static void print_prog_json(struct bpf_prog_info *info, int fd) +{ + char *memlock; + + jsonw_start_object(json_wtr); + print_prog_header_json(info); print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); if (info->load_time) { @@ -268,10 +341,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) jsonw_end_object(json_wtr); } -static void print_prog_plain(struct bpf_prog_info *info, int fd) +static void print_prog_header_plain(struct bpf_prog_info *info) { - char *memlock; - printf("%u: ", info->id); if (info->type < ARRAY_SIZE(prog_type_name)) printf("%s ", prog_type_name[info->type]); @@ -289,6 +360,13 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) printf(" run_time_ns %lld run_cnt %lld", info->run_time_ns, info->run_cnt); printf("\n"); +} + +static void print_prog_plain(struct bpf_prog_info *info, int fd) +{ + char *memlock; + + print_prog_header_plain(info); if (info->load_time) { char buf[32]; @@ -349,6 +427,40 @@ static int show_prog(int fd) return 0; } +static int do_show_subset(int argc, char **argv) +{ + int *fds = NULL; + int nb_fds, i; + int err = -1; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + err = show_prog(fds[i]); + if (err) { + for (; i < nb_fds; i++) + close(fds[i]); + break; + } + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_free: + free(fds); + return err; +} + static int do_show(int argc, char 
**argv) { __u32 id = 0; @@ -358,15 +470,8 @@ static int do_show(int argc, char **argv) if (show_pinned) build_pinned_obj_table(&prog_table, BPF_OBJ_PROG); - if (argc == 2) { - fd = prog_parse_fd(&argc, &argv); - if (fd < 0) - return -1; - - err = show_prog(fd); - close(fd); - return err; - } + if (argc == 2) + return do_show_subset(argc, argv); if (argc) return BAD_ARG(); @@ -408,101 +513,32 @@ static int do_show(int argc, char **argv) return err; } -static int do_dump(int argc, char **argv) +static int +prog_dump(struct bpf_prog_info *info, enum dump_mode mode, + char *filepath, bool opcodes, bool visual, bool linum) { - struct bpf_prog_info_linear *info_linear; struct bpf_prog_linfo *prog_linfo = NULL; - enum {DUMP_JITED, DUMP_XLATED} mode; const char *disasm_opt = NULL; - struct bpf_prog_info *info; struct dump_data dd = {}; void *func_info = NULL; struct btf *btf = NULL; - char *filepath = NULL; - bool opcodes = false; - bool visual = false; char func_sig[1024]; unsigned char *buf; - bool linum = false; __u32 member_len; - __u64 arrays; ssize_t n; int fd; - if (is_prefix(*argv, "jited")) { - if (disasm_init()) - return -1; - mode = DUMP_JITED; - } else if (is_prefix(*argv, "xlated")) { - mode = DUMP_XLATED; - } else { - p_err("expected 'xlated' or 'jited', got: %s", *argv); - return -1; - } - NEXT_ARG(); - - if (argc < 2) - usage(); - - fd = prog_parse_fd(&argc, &argv); - if (fd < 0) - return -1; - - if (is_prefix(*argv, "file")) { - NEXT_ARG(); - if (!argc) { - p_err("expected file path"); - return -1; - } - - filepath = *argv; - NEXT_ARG(); - } else if (is_prefix(*argv, "opcodes")) { - opcodes = true; - NEXT_ARG(); - } else if (is_prefix(*argv, "visual")) { - visual = true; - NEXT_ARG(); - } else if (is_prefix(*argv, "linum")) { - linum = true; - NEXT_ARG(); - } - - if (argc) { - usage(); - return -1; - } - - if (mode == DUMP_JITED) - arrays = 1UL << BPF_PROG_INFO_JITED_INSNS; - else - arrays = 1UL << BPF_PROG_INFO_XLATED_INSNS; - - arrays |= 1UL << BPF_PROG_INFO_JITED_KSYMS; - arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; - arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; - arrays |= 1UL << BPF_PROG_INFO_LINE_INFO; - arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO; - - info_linear = bpf_program__get_prog_info_linear(fd, arrays); - close(fd); - if (IS_ERR_OR_NULL(info_linear)) { - p_err("can't get prog info: %s", strerror(errno)); - return -1; - } - - info = &info_linear->info; if (mode == DUMP_JITED) { if (info->jited_prog_len == 0 || !info->jited_prog_insns) { p_info("no instructions returned"); - goto err_free; + return -1; } buf = (unsigned char *)(info->jited_prog_insns); member_len = info->jited_prog_len; } else { /* DUMP_XLATED */ if (info->xlated_prog_len == 0) { p_err("error retrieving insn dump: kernel.kptr_restrict set?"); - goto err_free; + return -1; } buf = (unsigned char *)info->xlated_prog_insns; member_len = info->xlated_prog_len; @@ -510,7 +546,7 @@ static int do_dump(int argc, char **argv) if (info->btf_id && btf__get_from_id(info->btf_id, &btf)) { p_err("failed to get btf"); - goto err_free; + return -1; } func_info = (void *)info->func_info; @@ -526,7 +562,7 @@ static int do_dump(int argc, char **argv) if (fd < 0) { p_err("can't open file %s: %s", filepath, strerror(errno)); - goto err_free; + return -1; } n = write(fd, buf, member_len); @@ -534,7 +570,7 @@ static int do_dump(int argc, char **argv) if (n != member_len) { p_err("error writing output file: %s", n < 0 ? 
strerror(errno) : "short write"); - goto err_free; + return -1; } if (json_output) @@ -548,7 +584,7 @@ static int do_dump(int argc, char **argv) info->netns_ino, &disasm_opt); if (!name) - goto err_free; + return -1; } if (info->nr_jited_func_lens && info->jited_func_lens) { @@ -643,12 +679,130 @@ static int do_dump(int argc, char **argv) kernel_syms_destroy(&dd); } - free(info_linear); return 0; +} -err_free: - free(info_linear); - return -1; +static int do_dump(int argc, char **argv) +{ + struct bpf_prog_info_linear *info_linear; + char *filepath = NULL; + bool opcodes = false; + bool visual = false; + enum dump_mode mode; + bool linum = false; + int *fds = NULL; + int nb_fds, i = 0; + int err = -1; + __u64 arrays; + + if (is_prefix(*argv, "jited")) { + if (disasm_init()) + return -1; + mode = DUMP_JITED; + } else if (is_prefix(*argv, "xlated")) { + mode = DUMP_XLATED; + } else { + p_err("expected 'xlated' or 'jited', got: %s", *argv); + return -1; + } + NEXT_ARG(); + + if (argc < 2) + usage(); + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (is_prefix(*argv, "file")) { + NEXT_ARG(); + if (!argc) { + p_err("expected file path"); + goto exit_close; + } + if (nb_fds > 1) { + p_err("several programs matched"); + goto exit_close; + } + + filepath = *argv; + NEXT_ARG(); + } else if (is_prefix(*argv, "opcodes")) { + opcodes = true; + NEXT_ARG(); + } else if (is_prefix(*argv, "visual")) { + if (nb_fds > 1) { + p_err("several programs matched"); + goto exit_close; + } + + visual = true; + NEXT_ARG(); + } else if (is_prefix(*argv, "linum")) { + linum = true; + NEXT_ARG(); + } + + if (argc) { + usage(); + goto exit_close; + } + + if (mode == DUMP_JITED) + arrays = 1UL << BPF_PROG_INFO_JITED_INSNS; + else + arrays = 1UL << BPF_PROG_INFO_XLATED_INSNS; + + arrays |= 1UL << BPF_PROG_INFO_JITED_KSYMS; + arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; + arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; + arrays |= 1UL << BPF_PROG_INFO_LINE_INFO; + arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + info_linear = bpf_program__get_prog_info_linear(fds[i], arrays); + if (IS_ERR_OR_NULL(info_linear)) { + p_err("can't get prog info: %s", strerror(errno)); + break; + } + + if (json_output && nb_fds > 1) { + jsonw_start_object(json_wtr); /* prog object */ + print_prog_header_json(&info_linear->info); + jsonw_name(json_wtr, "insns"); + } else if (nb_fds > 1) { + print_prog_header_plain(&info_linear->info); + } + + err = prog_dump(&info_linear->info, mode, filepath, opcodes, + visual, linum); + + if (json_output && nb_fds > 1) + jsonw_end_object(json_wtr); /* prog object */ + else if (i != nb_fds - 1 && nb_fds > 1) + printf("\n"); + + free(info_linear); + if (err) + break; + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_close: + for (; i < nb_fds; i++) + close(fds[i]); +exit_free: + free(fds); + return err; } static int do_pin(int argc, char **argv) diff --git a/tools/include/uapi/asm/bpf_perf_event.h b/tools/include/uapi/asm/bpf_perf_event.h index 13a58531e6fa..39acc149d843 100644 --- a/tools/include/uapi/asm/bpf_perf_event.h +++ b/tools/include/uapi/asm/bpf_perf_event.h @@ -2,6 +2,8 @@ #include "../../arch/arm64/include/uapi/asm/bpf_perf_event.h" #elif defined(__s390__) #include 
"../../arch/s390/include/uapi/asm/bpf_perf_event.h" +#elif defined(__riscv) +#include "../../arch/riscv/include/uapi/asm/bpf_perf_event.h" #else #include <uapi/asm-generic/bpf_perf_event.h> #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dbbcf0b02970..7df436da542d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -231,6 +231,11 @@ enum bpf_attach_type { * When children program makes decision (like picking TCP CA or sock bind) * parent program has a chance to override it. * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. * A cgroup with NONE doesn't allow any programs in sub-cgroups. * Ex1: @@ -249,6 +254,7 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) #define BPF_F_ALLOW_MULTI (1U << 1) +#define BPF_F_REPLACE (1U << 2) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel @@ -442,6 +448,10 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 63ae4a39e58b..1a2898c482ee 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -22,9 +22,9 @@ struct btf_header { }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x0000ffff +#define BTF_MAX_TYPE 0x000fffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x0000ffff +#define BTF_MAX_NAME_OFFSET 0x00ffffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff @@ -142,7 +142,8 @@ struct btf_param { enum { BTF_VAR_STATIC = 0, - BTF_VAR_GLOBAL_ALLOCATED, + BTF_VAR_GLOBAL_ALLOCATED = 1, + BTF_VAR_GLOBAL_EXTERN = 2, }; /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index defae23a0169..d4790121adf4 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -56,8 +56,8 @@ ifndef VERBOSE endif FEATURE_USER = .libbpf -FEATURE_TESTS = libelf libelf-mmap bpf reallocarray -FEATURE_DISPLAY = libelf bpf +FEATURE_TESTS = libelf libelf-mmap zlib bpf reallocarray +FEATURE_DISPLAY = libelf zlib bpf INCLUDES = -I. 
-I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES) @@ -147,6 +147,7 @@ TAGS_PROG := $(if $(shell which etags 2>/dev/null),etags,ctags) GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \ + sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \ sort -u | wc -l) VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ @@ -159,7 +160,7 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_SHARED): force elfdep zdep bpfdep bpf_helper_defs.h @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -177,7 +178,7 @@ $(BPF_IN_SHARED): force elfdep bpfdep bpf_helper_defs.h echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" -$(BPF_IN_STATIC): force elfdep bpfdep bpf_helper_defs.h +$(BPF_IN_STATIC): force elfdep zdep bpfdep bpf_helper_defs.h $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) bpf_helper_defs.h: $(srctree)/tools/include/uapi/linux/bpf.h @@ -189,7 +190,7 @@ $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) $(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED) $(QUIET_LINK)$(CC) $(LDFLAGS) \ --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \ - -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@ + -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -lz -o $@ @ln -sf $(@F) $(OUTPUT)libbpf.so @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION) @@ -213,6 +214,7 @@ check_abi: $(OUTPUT)libbpf.so "versioned in $(VERSION_SCRIPT)." 
>&2; \ readelf -s --wide $(BPF_IN_SHARED) | \ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \ + sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'| \ sort -u > $(OUTPUT)libbpf_global_syms.tmp; \ readelf -s --wide $(OUTPUT)libbpf.so | \ @@ -249,6 +251,7 @@ install_headers: bpf_helper_defs.h $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ $(call do_install,btf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf_util.h,$(prefix)/include/bpf,644); \ + $(call do_install,libbpf_common.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helper_defs.h,$(prefix)/include/bpf,644); \ @@ -277,12 +280,15 @@ clean: -PHONY += force elfdep bpfdep cscope tags +PHONY += force elfdep zdep bpfdep cscope tags force: elfdep: @if [ "$(feature-libelf)" != "1" ]; then echo "No libelf found"; exit 1 ; fi +zdep: + @if [ "$(feature-zlib)" != "1" ]; then echo "No zlib found"; exit 1 ; fi + bpfdep: @if [ "$(feature-bpf)" != "1" ]; then echo "BPF API too old"; exit 1 ; fi diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 98596e15390f..a787d53699c8 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -467,13 +467,28 @@ int bpf_obj_get(const char *pathname) int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, unsigned int flags) { + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = flags, + ); + + return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts); +} + +int bpf_prog_attach_xattr(int prog_fd, int target_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts) +{ union bpf_attr attr; + if (!OPTS_VALID(opts, bpf_prog_attach_opts)) + return -EINVAL; + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; - attr.attach_flags = flags; + attr.attach_flags = OPTS_GET(opts, flags, 0); + attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 3c791fa8e68e..f0ab8519986e 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -28,14 +28,12 @@ #include <stddef.h> #include <stdint.h> +#include "libbpf_common.h" + #ifdef __cplusplus extern "C" { #endif -#ifndef LIBBPF_API -#define LIBBPF_API __attribute__((visibility("default"))) -#endif - struct bpf_create_map_attr { const char *name; enum bpf_map_type map_type; @@ -128,8 +126,19 @@ LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); LIBBPF_API int bpf_map_freeze(int fd); LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); LIBBPF_API int bpf_obj_get(const char *pathname); + +struct bpf_prog_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + unsigned int flags; + int replace_prog_fd; +}; +#define bpf_prog_attach_opts__last_field replace_prog_fd + LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); +LIBBPF_API int bpf_prog_attach_xattr(int prog_fd, int attachable_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts); LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type);
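Together with the BPF_F_REPLACE UAPI flag introduced earlier in this series, the opts-based attach entry point above lets a caller atomically swap one program in a cgroup's multi-program list. A minimal sketch, under the assumption that cgroup_fd, old_prog_fd and new_prog_fd are valid descriptors obtained elsewhere:

/* Sketch: replace old_prog_fd with new_prog_fd in the cgroup's
 * egress program list, keeping all other attached programs intact.
 */
DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts,
	.flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE,
	.replace_prog_fd = old_prog_fd,
);

int err = bpf_prog_attach_xattr(new_prog_fd, cgroup_fd,
				BPF_CGROUP_INET_EGRESS, &opts);
if (err)
	/* -EINVAL for malformed opts, otherwise the attach syscall failed */
	fprintf(stderr, "prog replace failed: %d\n", err);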
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 0c7d28292898..f69cc208778a 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -25,6 +25,9 @@ #ifndef __always_inline #define __always_inline __attribute__((always_inline)) #endif +#ifndef __weak +#define __weak __attribute__((weak)) +#endif /* * Helper structure used by eBPF C program */ @@ -44,4 +47,12 @@ enum libbpf_pin_type { LIBBPF_PIN_BY_NAME, }; +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + +#define __kconfig __attribute__((section(".kconfig"))) + #endif
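On the BPF-program side, the __kconfig section attribute and libbpf_tristate enum added above declare externs that libbpf resolves against the running kernel's configuration (read from /proc/config.gz or /boot/config-$(uname -r), as implemented further down in libbpf.c). A hedged sketch; the CONFIG_* names are only examples, __weak marks values that may be absent, and SEC() comes from this same bpf_helpers.h:

/* Sketch of BPF-side usage; resolved values land in the read-only
 * .kconfig map when the object is loaded.
 */
extern int CONFIG_HZ __kconfig;                            /* plain integer */
extern enum libbpf_tristate CONFIG_VETH __kconfig __weak;  /* y/m/n, may be unset */

SEC("kprobe/do_sys_open")
int cfg_example(void *ctx)
{
	if (CONFIG_VETH == TRI_MODULE) {
		/* veth built as a module; adjust behaviour accordingly */
	}
	return CONFIG_HZ > 250;  /* constant from the moment of load */
}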
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 88efa2bb7137..5f04f56e1eb6 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -278,6 +278,45 @@ done: return nelems * size; } +int btf__align_of(const struct btf *btf, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + __u16 kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + return min(sizeof(void *), t->size); + case BTF_KIND_PTR: + return sizeof(void *); + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return btf__align_of(btf, t->type); + case BTF_KIND_ARRAY: + return btf__align_of(btf, btf_array(t)->type); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, max_align = 1, align; + + for (i = 0; i < vlen; i++, m++) { + align = btf__align_of(btf, m->type); + if (align <= 0) + return align; + max_align = max(max_align, align); + } + + return max_align; + } + default: + pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); + return 0; + } +} + int btf__resolve_type(const struct btf *btf, __u32 type_id) { const struct btf_type *t; @@ -539,6 +578,12 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, return -ENOENT; } + /* .extern datasec size and var offsets were set correctly during + * extern collection step, so just skip straight to sorting variables + */ + if (t->size) + goto sort_vars; + ret = bpf_object__section_size(obj, name, &size); if (ret || !size || (t->size && t->size != size)) { pr_debug("Invalid size for section %s: %u bytes\n", name, size); @@ -575,7 +620,8 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, vsi->offset = off; } - qsort(t + 1, vars, sizeof(*vsi), compare_vsi_off); +sort_vars: + qsort(btf_var_secinfos(t), vars, sizeof(*vsi), compare_vsi_off); return 0; } diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index d9ac73a02cde..8d73f7f5551f 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -8,14 +8,12 @@ #include <linux/btf.h> #include <linux/types.h> +#include "libbpf_common.h" + #ifdef __cplusplus extern "C" { #endif -#ifndef LIBBPF_API -#define LIBBPF_API __attribute__((visibility("default"))) -#endif - #define BTF_ELF_SEC ".BTF" #define BTF_EXT_ELF_SEC ".BTF.ext" #define MAPS_ELF_SEC ".maps" @@ -79,6 +77,7 @@ LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__align_of(const struct btf *btf, __u32 id); LIBBPF_API int btf__fd(const struct btf *btf); LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); @@ -127,6 +126,28 @@ LIBBPF_API void btf_dump__free(struct btf_dump *d); LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); +struct btf_dump_emit_type_decl_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* optional field name for type declaration, e.g.: + * - struct my_struct <FNAME> + * - void (*<FNAME>)(int) + * - char (*<FNAME>)[123] + */ + const char *field_name; + /* extra indentation level (in number of tabs) to emit for multi-line + * type declarations (e.g., anonymous struct); applies for lines + * starting from the second one (first line is assumed to have + * necessary indentation already) + */ + int indent_level; +}; +#define btf_dump_emit_type_decl_opts__last_field indent_level + +LIBBPF_API int +btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts); + /* * A set of helpers for easier BTF types handling */
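The opts-based declaration emitter added to btf.h above would be driven roughly like this (a sketch: assumes an existing struct btf_dump *d created with btf_dump__new() and a valid type id; output goes to the printf callback the dump was constructed with):

/* Sketch: emit a C declaration for type `id`, naming the declared
 * field "my_field" and indenting continuation lines by one tab.
 */
DECLARE_LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts,
	.field_name = "my_field",  /* optional; may be omitted */
	.indent_level = 1,
);

if (btf_dump__emit_type_decl(d, id, &opts))
	/* only fails with -EINVAL when opts itself is malformed */
	fprintf(stderr, "emit_type_decl failed\n");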
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index cb126d8fcf75..e95f7710f210 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -116,6 +116,8 @@ static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) va_end(args); } +static int btf_dump_mark_referenced(struct btf_dump *d); + struct btf_dump *btf_dump__new(const struct btf *btf, const struct btf_ext *btf_ext, const struct btf_dump_opts *opts, @@ -137,18 +139,39 @@ struct btf_dump *btf_dump__new(const struct btf *btf, if (IS_ERR(d->type_names)) { err = PTR_ERR(d->type_names); d->type_names = NULL; - btf_dump__free(d); - return ERR_PTR(err); } d->ident_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); if (IS_ERR(d->ident_names)) { err = PTR_ERR(d->ident_names); d->ident_names = NULL; - btf_dump__free(d); - return ERR_PTR(err); + goto err; + } + d->type_states = calloc(1 + btf__get_nr_types(d->btf), + sizeof(d->type_states[0])); + if (!d->type_states) { + err = -ENOMEM; + goto err; + } + d->cached_names = calloc(1 + btf__get_nr_types(d->btf), + sizeof(d->cached_names[0])); + if (!d->cached_names) { + err = -ENOMEM; + goto err; } + /* VOID is special */ + d->type_states[0].order_state = ORDERED; + d->type_states[0].emit_state = EMITTED; + + /* eagerly determine referenced types for anon enums */ + err = btf_dump_mark_referenced(d); + if (err) + goto err; + return d; +err: + btf_dump__free(d); + return ERR_PTR(err); } void btf_dump__free(struct btf_dump *d) @@ -175,7 +198,6 @@ void btf_dump__free(struct btf_dump *d) free(d); } -static int btf_dump_mark_referenced(struct btf_dump *d); static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); @@ -202,27 +224,6 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) if (id > btf__get_nr_types(d->btf)) return -EINVAL; - /* type states are lazily allocated, as they might not be needed */ - if (!d->type_states) { - d->type_states = calloc(1 + btf__get_nr_types(d->btf), - sizeof(d->type_states[0])); - if (!d->type_states) - return -ENOMEM; - d->cached_names = calloc(1 + btf__get_nr_types(d->btf), - sizeof(d->cached_names[0])); - if (!d->cached_names) - return -ENOMEM; - - /* VOID is special */ - d->type_states[0].order_state = ORDERED; - d->type_states[0].emit_state = EMITTED; - - /* eagerly determine referenced types for anon enums */ - err = btf_dump_mark_referenced(d); - if (err) - return err; - } - d->emit_queue_cnt = 0; err = btf_dump_order_type(d, id, false); if (err < 0) @@ -752,41 +753,6 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) } } -static int btf_align_of(const struct btf *btf, __u32 id) -{ - const struct btf_type *t = btf__type_by_id(btf, id); - __u16 kind = btf_kind(t); - - switch (kind) { - case BTF_KIND_INT: - case BTF_KIND_ENUM: - return min(sizeof(void *), t->size); - case BTF_KIND_PTR: - return sizeof(void *); - case BTF_KIND_TYPEDEF: - case BTF_KIND_VOLATILE: - case BTF_KIND_CONST: - case BTF_KIND_RESTRICT: - return btf_align_of(btf, t->type); - case BTF_KIND_ARRAY: - return btf_align_of(btf, btf_array(t)->type); - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - const struct btf_member *m = btf_members(t); - __u16 vlen = btf_vlen(t); - int i, align = 1; - - for (i = 0; i < vlen; i++, m++) - align = max(align, btf_align_of(btf, m->type)); - - return align; - } - default: - pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); - return 1; - } -} - static bool btf_is_struct_packed(const struct btf *btf, __u32 id, const struct btf_type *t) { @@ -794,18 +760,18 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, int align, i, bit_sz; __u16 vlen; - align = btf_align_of(btf, id); + align = btf__align_of(btf, id); /* size of a non-packed struct has to be a multiple of its alignment*/ - if (t->size % align) + if (align && t->size % align) return true; m = btf_members(t); vlen = btf_vlen(t); /* all non-bitfield fields have to be naturally aligned */ for (i = 0; i < vlen; i++, m++) { - align = btf_align_of(btf, m->type); + align = btf__align_of(btf, m->type); bit_sz = btf_member_bitfield_size(t, i); - if (bit_sz == 0 && m->offset % (8 * align) != 0) + if (align && bit_sz == 0 && m->offset % (8 * align) != 0) return true; } @@ -889,7 +855,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, fname = btf_name_of(d, m->name_off); m_sz = btf_member_bitfield_size(t, i); m_off = btf_member_bit_offset(t, i); - align = packed ? 1 : btf_align_of(d->btf, m->type); + align = packed ? 1 : btf__align_of(d->btf, m->type); btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1); btf_dump_printf(d, "\n%s", pfx(lvl + 1)); @@ -907,7 +873,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, /* pad at the end, if necessary */ if (is_struct) { - align = packed ? 1 : btf_align_of(d->btf, id); + align = packed ? 1 : btf__align_of(d->btf, id); btf_dump_emit_bit_padding(d, off, t->size * 8, 0, align, lvl + 1); } @@ -1051,6 +1017,21 @@ static int btf_dump_push_decl_stack_id(struct btf_dump *d, __u32 id) * of a stack frame. Some care is required to "pop" stack frames after * processing type declaration chain. */ +int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts) +{ + const char *fname; + int lvl; + + if (!OPTS_VALID(opts, btf_dump_emit_type_decl_opts)) + return -EINVAL; + + fname = OPTS_GET(opts, field_name, NULL); + lvl = OPTS_GET(opts, indent_level, 0); + btf_dump_emit_type_decl(d, id, fname, lvl); + return 0; +} + static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, const char *fname, int lvl) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3f09772192f1..7513165b104f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -18,6 +18,7 @@ #include <stdarg.h> #include <libgen.h> #include <inttypes.h> +#include <limits.h> #include <string.h> #include <unistd.h> #include <endian.h> @@ -41,9 +42,11 @@ #include <sys/types.h> #include <sys/vfs.h> #include <sys/utsname.h> +#include <sys/resource.h> #include <tools/libc_compat.h> #include <libelf.h> #include <gelf.h> +#include <zlib.h> #include "libbpf.h" #include "bpf.h" @@ -99,14 +102,33 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...) 
va_end(args); } -#define STRERR_BUFSIZE 128 +static void pr_perm_msg(int err) +{ + struct rlimit limit; + char buf[100]; + + if (err != -EPERM || geteuid() != 0) + return; + + err = getrlimit(RLIMIT_MEMLOCK, &limit); + if (err) + return; + + if (limit.rlim_cur == RLIM_INFINITY) + return; + + if (limit.rlim_cur < 1024) + snprintf(buf, sizeof(buf), "%zu bytes", (size_t)limit.rlim_cur); + else if (limit.rlim_cur < 1024*1024) + snprintf(buf, sizeof(buf), "%.1f KiB", (double)limit.rlim_cur / 1024); + else + snprintf(buf, sizeof(buf), "%.1f MiB", (double)limit.rlim_cur / (1024*1024)); -#define CHECK_ERR(action, err, out) do { \ - err = action; \ - if (err) \ - goto out; \ -} while (0) + pr_warn("permission error while running as root; try raising 'ulimit -l'? current value: %s\n", + buf); +} +#define STRERR_BUFSIZE 128 /* Copied from tools/perf/util/util.h */ #ifndef zfree @@ -146,6 +168,20 @@ struct bpf_capabilities { __u32 array_mmap:1; }; +enum reloc_type { + RELO_LD64, + RELO_CALL, + RELO_DATA, + RELO_EXTERN, +}; + +struct reloc_desc { + enum reloc_type type; + int insn_idx; + int map_idx; + int sym_off; +}; + /* * bpf_prog should be a better name but it has been used in * linux/filter.h. @@ -164,16 +200,7 @@ struct bpf_program { size_t insns_cnt, main_prog_cnt; enum bpf_prog_type type; - struct reloc_desc { - enum { - RELO_LD64, - RELO_CALL, - RELO_DATA, - } type; - int insn_idx; - int map_idx; - int sym_off; - } *reloc_desc; + struct reloc_desc *reloc_desc; int nr_reloc; int log_level; @@ -202,22 +229,29 @@ struct bpf_program { __u32 prog_flags; }; +#define DATA_SEC ".data" +#define BSS_SEC ".bss" +#define RODATA_SEC ".rodata" +#define KCONFIG_SEC ".kconfig" + enum libbpf_map_type { LIBBPF_MAP_UNSPEC, LIBBPF_MAP_DATA, LIBBPF_MAP_BSS, LIBBPF_MAP_RODATA, + LIBBPF_MAP_KCONFIG, }; static const char * const libbpf_type_to_btf_name[] = { - [LIBBPF_MAP_DATA] = ".data", - [LIBBPF_MAP_BSS] = ".bss", - [LIBBPF_MAP_RODATA] = ".rodata", + [LIBBPF_MAP_DATA] = DATA_SEC, + [LIBBPF_MAP_BSS] = BSS_SEC, + [LIBBPF_MAP_RODATA] = RODATA_SEC, + [LIBBPF_MAP_KCONFIG] = KCONFIG_SEC, }; struct bpf_map { - int fd; char *name; + int fd; int sec_idx; size_t sec_offset; int map_ifindex; @@ -228,14 +262,32 @@ struct bpf_map { void *priv; bpf_map_clear_priv_t clear_priv; enum libbpf_map_type libbpf_type; + void *mmaped; char *pin_path; bool pinned; bool reused; }; -struct bpf_secdata { - void *rodata; - void *data; +enum extern_type { + EXT_UNKNOWN, + EXT_CHAR, + EXT_BOOL, + EXT_INT, + EXT_TRISTATE, + EXT_CHAR_ARR, +}; + +struct extern_desc { + const char *name; + int sym_idx; + int btf_id; + enum extern_type type; + int sz; + int align; + int data_off; + bool is_signed; + bool is_weak; + bool is_set; }; static LIST_HEAD(bpf_objects_list); @@ -250,7 +302,11 @@ struct bpf_object { struct bpf_map *maps; size_t nr_maps; size_t maps_cap; - struct bpf_secdata sections; + + char *kconfig; + struct extern_desc *externs; + int nr_extern; + int kconfig_map_idx; bool loaded; bool has_pseudo_calls; @@ -279,6 +335,7 @@ struct bpf_object { int maps_shndx; int btf_maps_shndx; int text_shndx; + int symbols_shndx; int data_shndx; int rodata_shndx; int bss_shndx; @@ -550,6 +607,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.data_shndx = -1; obj->efile.rodata_shndx = -1; obj->efile.bss_shndx = -1; + obj->kconfig_map_idx = -1; obj->kern_version = get_kernel_version(); obj->loaded = false; @@ -748,13 +806,13 @@ int bpf_object__section_size(const struct bpf_object *obj, const char *name, *size = 0; if (!name) { 
return -EINVAL; - } else if (!strcmp(name, ".data")) { + } else if (!strcmp(name, DATA_SEC)) { if (obj->efile.data) *size = obj->efile.data->d_size; - } else if (!strcmp(name, ".bss")) { + } else if (!strcmp(name, BSS_SEC)) { if (obj->efile.bss) *size = obj->efile.bss->d_size; - } else if (!strcmp(name, ".rodata")) { + } else if (!strcmp(name, RODATA_SEC)) { if (obj->efile.rodata) *size = obj->efile.rodata->d_size; } else { @@ -835,13 +893,38 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) return &obj->maps[obj->nr_maps++]; } +static size_t bpf_map_mmap_sz(const struct bpf_map *map) +{ + long page_sz = sysconf(_SC_PAGE_SIZE); + size_t map_sz; + + map_sz = roundup(map->def.value_size, 8) * map->def.max_entries; + map_sz = roundup(map_sz, page_sz); + return map_sz; +} + +static char *internal_map_name(struct bpf_object *obj, + enum libbpf_map_type type) +{ + char map_name[BPF_OBJ_NAME_LEN]; + const char *sfx = libbpf_type_to_btf_name[type]; + int sfx_len = max((size_t)7, strlen(sfx)); + int pfx_len = min((size_t)BPF_OBJ_NAME_LEN - sfx_len - 1, + strlen(obj->name)); + + snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, + sfx_len, libbpf_type_to_btf_name[type]); + + return strdup(map_name); +} + static int bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, - int sec_idx, Elf_Data *data, void **data_buff) + int sec_idx, void *data, size_t data_sz) { - char map_name[BPF_OBJ_NAME_LEN]; struct bpf_map_def *def; struct bpf_map *map; + int err; map = bpf_object__add_map(obj); if (IS_ERR(map)) @@ -850,9 +933,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, map->libbpf_type = type; map->sec_idx = sec_idx; map->sec_offset = 0; - snprintf(map_name, sizeof(map_name), "%.8s%.7s", obj->name, - libbpf_type_to_btf_name[type]); - map->name = strdup(map_name); + map->name = internal_map_name(obj, type); if (!map->name) { pr_warn("failed to alloc map name\n"); return -ENOMEM; @@ -861,25 +942,29 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def = &map->def; def->type = BPF_MAP_TYPE_ARRAY; def->key_size = sizeof(int); - def->value_size = data->d_size; + def->value_size = data_sz; def->max_entries = 1; - def->map_flags = type == LIBBPF_MAP_RODATA ? BPF_F_RDONLY_PROG : 0; - if (obj->caps.array_mmap) - def->map_flags |= BPF_F_MMAPABLE; + def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG + ? 
BPF_F_RDONLY_PROG : 0; + def->map_flags |= BPF_F_MMAPABLE; pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", - map_name, map->sec_idx, map->sec_offset, def->map_flags); + map->name, map->sec_idx, map->sec_offset, def->map_flags); - if (data_buff) { - *data_buff = malloc(data->d_size); - if (!*data_buff) { - zfree(&map->name); - pr_warn("failed to alloc map content buffer\n"); - return -ENOMEM; - } - memcpy(*data_buff, data->d_buf, data->d_size); + map->mmaped = mmap(NULL, bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("failed to alloc map '%s' content buffer: %d\n", + map->name, err); + zfree(&map->name); + return err; } + if (data) + memcpy(map->mmaped, data, data_sz); + pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name); return 0; } @@ -888,37 +973,332 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) { int err; - if (!obj->caps.global_data) - return 0; /* * Populate obj->maps with libbpf internal maps. */ if (obj->efile.data_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_DATA, obj->efile.data_shndx, - obj->efile.data, - &obj->sections.data); + obj->efile.data->d_buf, + obj->efile.data->d_size); if (err) return err; } if (obj->efile.rodata_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_RODATA, obj->efile.rodata_shndx, - obj->efile.rodata, - &obj->sections.rodata); + obj->efile.rodata->d_buf, + obj->efile.rodata->d_size); if (err) return err; } if (obj->efile.bss_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, obj->efile.bss_shndx, - obj->efile.bss, NULL); + NULL, + obj->efile.bss->d_size); if (err) return err; } return 0; } + +static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, + const void *name) +{ + int i; + + for (i = 0; i < obj->nr_extern; i++) { + if (strcmp(obj->externs[i].name, name) == 0) + return &obj->externs[i]; + } + return NULL; +} + +static int set_ext_value_tri(struct extern_desc *ext, void *ext_val, + char value) +{ + switch (ext->type) { + case EXT_BOOL: + if (value == 'm') { + pr_warn("extern %s=%c should be tristate or char\n", + ext->name, value); + return -EINVAL; + } + *(bool *)ext_val = value == 'y' ? 
true : false; + break; + case EXT_TRISTATE: + if (value == 'y') + *(enum libbpf_tristate *)ext_val = TRI_YES; + else if (value == 'm') + *(enum libbpf_tristate *)ext_val = TRI_MODULE; + else /* value == 'n' */ + *(enum libbpf_tristate *)ext_val = TRI_NO; + break; + case EXT_CHAR: + *(char *)ext_val = value; + break; + case EXT_UNKNOWN: + case EXT_INT: + case EXT_CHAR_ARR: + default: + pr_warn("extern %s=%c should be bool, tristate, or char\n", + ext->name, value); + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int set_ext_value_str(struct extern_desc *ext, char *ext_val, + const char *value) +{ + size_t len; + + if (ext->type != EXT_CHAR_ARR) { + pr_warn("extern %s=%s should be char array\n", ext->name, value); + return -EINVAL; + } + + len = strlen(value); + if (value[len - 1] != '"') { + pr_warn("extern '%s': invalid string config '%s'\n", + ext->name, value); + return -EINVAL; + } + + /* strip quotes */ + len -= 2; + if (len >= ext->sz) { + pr_warn("extern '%s': string config %s (%zu bytes) truncated to %d bytes\n", + ext->name, value, len, ext->sz - 1); + len = ext->sz - 1; + } + memcpy(ext_val, value + 1, len); + ext_val[len] = '\0'; + ext->is_set = true; + return 0; +} + +static int parse_u64(const char *value, __u64 *res) +{ + char *value_end; + int err; + + errno = 0; + *res = strtoull(value, &value_end, 0); + if (errno) { + err = -errno; + pr_warn("failed to parse '%s' as integer: %d\n", value, err); + return err; + } + if (*value_end) { + pr_warn("failed to parse '%s' as integer completely\n", value); + return -EINVAL; + } + return 0; +} + +static bool is_ext_value_in_range(const struct extern_desc *ext, __u64 v) +{ + int bit_sz = ext->sz * 8; + + if (ext->sz == 8) + return true; + + /* Validate that value stored in u64 fits in integer of `ext->sz` + * bytes size without any loss of information. If the target integer + * is signed, we rely on the following limits of integer type of + * Y bits and subsequent transformation: + * + * -2^(Y-1) <= X <= 2^(Y-1) - 1 + * 0 <= X + 2^(Y-1) <= 2^Y - 1 + * 0 <= X + 2^(Y-1) < 2^Y + * + * For unsigned target integer, check that all the (64 - Y) bits are + * zero. + */ + if (ext->is_signed) + return v + (1ULL << (bit_sz - 1)) < (1ULL << bit_sz); + else + return (v >> bit_sz) == 0; +} + +static int set_ext_value_num(struct extern_desc *ext, void *ext_val, + __u64 value) +{ + if (ext->type != EXT_INT && ext->type != EXT_CHAR) { + pr_warn("extern %s=%llu should be integer\n", + ext->name, (unsigned long long)value); + return -EINVAL; + } + if (!is_ext_value_in_range(ext, value)) { + pr_warn("extern %s=%llu value doesn't fit in %d bytes\n", + ext->name, (unsigned long long)value, ext->sz); + return -ERANGE; + } + switch (ext->sz) { + case 1: *(__u8 *)ext_val = value; break; + case 2: *(__u16 *)ext_val = value; break; + case 4: *(__u32 *)ext_val = value; break; + case 8: *(__u64 *)ext_val = value; break; + default: + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int bpf_object__process_kconfig_line(struct bpf_object *obj, + char *buf, void *data) +{ + struct extern_desc *ext; + char *sep, *value; + int len, err = 0; + void *ext_val; + __u64 num; + + if (strncmp(buf, "CONFIG_", 7)) + return 0; + + sep = strchr(buf, '='); + if (!sep) { + pr_warn("failed to parse '%s': no separator\n", buf); + return -EINVAL; + } + + /* Trim ending '\n' */ + len = strlen(buf); + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + /* Split on '=' and ensure that a value is present. 
*/ + *sep = '\0'; + if (!sep[1]) { + *sep = '='; + pr_warn("failed to parse '%s': no value\n", buf); + return -EINVAL; + } + + ext = find_extern_by_name(obj, buf); + if (!ext || ext->is_set) + return 0; + + ext_val = data + ext->data_off; + value = sep + 1; + + switch (*value) { + case 'y': case 'n': case 'm': + err = set_ext_value_tri(ext, ext_val, *value); + break; + case '"': + err = set_ext_value_str(ext, ext_val, value); + break; + default: + /* assume integer */ + err = parse_u64(value, &num); + if (err) { + pr_warn("extern %s=%s should be integer\n", + ext->name, value); + return err; + } + err = set_ext_value_num(ext, ext_val, num); + break; + } + if (err) + return err; + pr_debug("extern %s=%s\n", ext->name, value); + return 0; +} + +static int bpf_object__read_kconfig_file(struct bpf_object *obj, void *data) +{ + char buf[PATH_MAX]; + struct utsname uts; + int len, err = 0; + gzFile file; + + uname(&uts); + len = snprintf(buf, PATH_MAX, "/boot/config-%s", uts.release); + if (len < 0) + return -EINVAL; + else if (len >= PATH_MAX) + return -ENAMETOOLONG; + + /* gzopen also accepts uncompressed files. */ + file = gzopen(buf, "r"); + if (!file) + file = gzopen("/proc/config.gz", "r"); + + if (!file) { + pr_warn("failed to open system Kconfig\n"); + return -ENOENT; + } + + while (gzgets(file, buf, sizeof(buf))) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing system Kconfig line '%s': %d\n", + buf, err); + goto out; + } + } + +out: + gzclose(file); + return err; +} + +static int bpf_object__read_kconfig_mem(struct bpf_object *obj, + const char *config, void *data) +{ + char buf[PATH_MAX]; + int err = 0; + FILE *file; + + file = fmemopen((void *)config, strlen(config), "r"); + if (!file) { + err = -errno; + pr_warn("failed to open in-memory Kconfig: %d\n", err); + return err; + } + + while (fgets(buf, sizeof(buf), file)) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing in-memory Kconfig line '%s': %d\n", + buf, err); + break; + } + } + + fclose(file); + return err; +} + +static int bpf_object__init_kconfig_map(struct bpf_object *obj) +{ + struct extern_desc *last_ext; + size_t map_sz; + int err; + + if (obj->nr_extern == 0) + return 0; + + last_ext = &obj->externs[obj->nr_extern - 1]; + map_sz = last_ext->data_off + last_ext->sz; + + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_KCONFIG, + obj->efile.symbols_shndx, + NULL, map_sz); + if (err) + return err; + + obj->kconfig_map_idx = obj->nr_maps - 1; + + return 0; +} + static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) { Elf_Data *symbols = obj->efile.symbols; @@ -1242,15 +1622,15 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { - pr_warn("map '%s': can't determine key size for type [%u]: %lld.\n", - map_name, t->type, sz); + pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found key [%u], sz = %lld.\n", - map_name, t->type, sz); + pr_debug("map '%s': found key [%u], sz = %zd.\n", + map_name, t->type, (ssize_t)sz); if (map->def.key_size && map->def.key_size != sz) { - pr_warn("map '%s': conflicting key size %u != %lld.\n", - map_name, map->def.key_size, sz); + pr_warn("map '%s': conflicting key size %u != %zd.\n", + map_name, map->def.key_size, (ssize_t)sz); return -EINVAL; } map->def.key_size = sz; @@ -1285,15 +1665,15 @@ static int 
bpf_object__init_user_btf_map(struct bpf_object *obj, } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { - pr_warn("map '%s': can't determine value size for type [%u]: %lld.\n", - map_name, t->type, sz); + pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found value [%u], sz = %lld.\n", - map_name, t->type, sz); + pr_debug("map '%s': found value [%u], sz = %zd.\n", + map_name, t->type, (ssize_t)sz); if (map->def.value_size && map->def.value_size != sz) { - pr_warn("map '%s': conflicting value size %u != %lld.\n", - map_name, map->def.value_size, sz); + pr_warn("map '%s': conflicting value size %u != %zd.\n", + map_name, map->def.value_size, (ssize_t)sz); return -EINVAL; } map->def.value_size = sz; @@ -1393,21 +1773,20 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, return 0; } -static int bpf_object__init_maps(struct bpf_object *obj, bool relaxed_maps, - const char *pin_root_path) +static int bpf_object__init_maps(struct bpf_object *obj, + const struct bpf_object_open_opts *opts) { - bool strict = !relaxed_maps; + const char *pin_root_path; + bool strict; int err; - err = bpf_object__init_user_maps(obj, strict); - if (err) - return err; - - err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path); - if (err) - return err; + strict = !OPTS_GET(opts, relaxed_maps, false); + pin_root_path = OPTS_GET(opts, pin_root_path, NULL); - err = bpf_object__init_global_data_maps(obj); + err = bpf_object__init_user_maps(obj, strict); + err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); + err = err ?: bpf_object__init_global_data_maps(obj); + err = err ?: bpf_object__init_kconfig_map(obj); if (err) return err; @@ -1509,7 +1888,8 @@ static void bpf_object__sanitize_btf_ext(struct bpf_object *obj) static bool bpf_object__is_btf_mandatory(const struct bpf_object *obj) { - return obj->efile.btf_maps_shndx >= 0; + return obj->efile.btf_maps_shndx >= 0 || + obj->nr_extern > 0; } static int bpf_object__init_btf(struct bpf_object *obj, @@ -1526,11 +1906,6 @@ static int bpf_object__init_btf(struct bpf_object *obj, BTF_ELF_SEC, err); goto out; } - err = btf__finalize_data(obj, obj->btf); - if (err) { - pr_warn("Error finalizing %s: %d.\n", BTF_ELF_SEC, err); - goto out; - } } if (btf_ext_data) { if (!obj->btf) { @@ -1564,6 +1939,30 @@ out: return 0; } +static int bpf_object__finalize_btf(struct bpf_object *obj) +{ + int err; + + if (!obj->btf) + return 0; + + err = btf__finalize_data(obj, obj->btf); + if (!err) + return 0; + + pr_warn("Error finalizing %s: %d.\n", BTF_ELF_SEC, err); + btf__free(obj->btf); + obj->btf = NULL; + btf_ext__free(obj->btf_ext); + obj->btf_ext = NULL; + + if (bpf_object__is_btf_mandatory(obj)) { + pr_warn("BTF is required, but is missing or corrupted.\n"); + return -ENOENT; + } + return 0; +} + static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) { int err = 0; @@ -1592,8 +1991,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) return 0; } -static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, - const char *pin_root_path) +static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; GElf_Ehdr *ep = &obj->efile.ehdr; @@ -1665,6 +2063,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, return -LIBBPF_ERRNO__FORMAT; } obj->efile.symbols = data; + obj->efile.symbols_shndx = idx; obj->efile.strtabidx = sh.sh_link; } 
else if (sh.sh_type == SHT_PROGBITS && data->d_size > 0) { if (sh.sh_flags & SHF_EXECINSTR) { @@ -1683,10 +2082,10 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, name, obj->path, cp); return err; } - } else if (strcmp(name, ".data") == 0) { + } else if (strcmp(name, DATA_SEC) == 0) { obj->efile.data = data; obj->efile.data_shndx = idx; - } else if (strcmp(name, ".rodata") == 0) { + } else if (strcmp(name, RODATA_SEC) == 0) { obj->efile.rodata = data; obj->efile.rodata_shndx = idx; } else { @@ -1716,7 +2115,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, obj->efile.reloc_sects[nr_sects].shdr = sh; obj->efile.reloc_sects[nr_sects].data = data; - } else if (sh.sh_type == SHT_NOBITS && strcmp(name, ".bss") == 0) { + } else if (sh.sh_type == SHT_NOBITS && + strcmp(name, BSS_SEC) == 0) { obj->efile.bss = data; obj->efile.bss_shndx = idx; } else { @@ -1728,14 +2128,217 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps, pr_warn("Corrupted ELF file: index of strtab invalid\n"); return -LIBBPF_ERRNO__FORMAT; } - err = bpf_object__init_btf(obj, btf_data, btf_ext_data); - if (!err) - err = bpf_object__init_maps(obj, relaxed_maps, pin_root_path); - if (!err) - err = bpf_object__sanitize_and_load_btf(obj); - if (!err) - err = bpf_object__init_prog_names(obj); - return err; + return bpf_object__init_btf(obj, btf_data, btf_ext_data); +} + +static bool sym_is_extern(const GElf_Sym *sym) +{ + int bind = GELF_ST_BIND(sym->st_info); + /* externs are symbols w/ type=NOTYPE, bind=GLOBAL|WEAK, section=UND */ + return sym->st_shndx == SHN_UNDEF && + (bind == STB_GLOBAL || bind == STB_WEAK) && + GELF_ST_TYPE(sym->st_info) == STT_NOTYPE; +} + +static int find_extern_btf_id(const struct btf *btf, const char *ext_name) +{ + const struct btf_type *t; + const char *var_name; + int i, n; + + if (!btf) + return -ESRCH; + + n = btf__get_nr_types(btf); + for (i = 1; i <= n; i++) { + t = btf__type_by_id(btf, i); + + if (!btf_is_var(t)) + continue; + + var_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(var_name, ext_name)) + continue; + + if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + return i; + } + + return -ENOENT; +} + +static enum extern_type find_extern_type(const struct btf *btf, int id, + bool *is_signed) +{ + const struct btf_type *t; + const char *name; + + t = skip_mods_and_typedefs(btf, id, NULL); + name = btf__name_by_offset(btf, t->name_off); + + if (is_signed) + *is_signed = false; + switch (btf_kind(t)) { + case BTF_KIND_INT: { + int enc = btf_int_encoding(t); + + if (enc & BTF_INT_BOOL) + return t->size == 1 ? EXT_BOOL : EXT_UNKNOWN; + if (is_signed) + *is_signed = enc & BTF_INT_SIGNED; + if (t->size == 1) + return EXT_CHAR; + if (t->size < 1 || t->size > 8 || (t->size & (t->size - 1))) + return EXT_UNKNOWN; + return EXT_INT; + } + case BTF_KIND_ENUM: + if (t->size != 4) + return EXT_UNKNOWN; + if (strcmp(name, "libbpf_tristate")) + return EXT_UNKNOWN; + return EXT_TRISTATE; + case BTF_KIND_ARRAY: + if (btf_array(t)->nelems == 0) + return EXT_UNKNOWN; + if (find_extern_type(btf, btf_array(t)->type, NULL) != EXT_CHAR) + return EXT_UNKNOWN; + return EXT_CHAR_ARR; + default: + return EXT_UNKNOWN; + } +} + +static int cmp_externs(const void *_a, const void *_b) +{ + const struct extern_desc *a = _a; + const struct extern_desc *b = _b; + + /* descending order by alignment requirements */ + if (a->align != b->align) + return a->align > b->align ? 
-1 : 1; + /* ascending order by size, within same alignment class */ + if (a->sz != b->sz) + return a->sz < b->sz ? -1 : 1; + /* resolve ties by name */ + return strcmp(a->name, b->name); +} + +static int bpf_object__collect_externs(struct bpf_object *obj) +{ + const struct btf_type *t; + struct extern_desc *ext; + int i, n, off, btf_id; + struct btf_type *sec; + const char *ext_name; + Elf_Scn *scn; + GElf_Shdr sh; + + if (!obj->efile.symbols) + return 0; + + scn = elf_getscn(obj->efile.elf, obj->efile.symbols_shndx); + if (!scn) + return -LIBBPF_ERRNO__FORMAT; + if (gelf_getshdr(scn, &sh) != &sh) + return -LIBBPF_ERRNO__FORMAT; + n = sh.sh_size / sh.sh_entsize; + + pr_debug("looking for externs among %d symbols...\n", n); + for (i = 0; i < n; i++) { + GElf_Sym sym; + + if (!gelf_getsym(obj->efile.symbols, i, &sym)) + return -LIBBPF_ERRNO__FORMAT; + if (!sym_is_extern(&sym)) + continue; + ext_name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, + sym.st_name); + if (!ext_name || !ext_name[0]) + continue; + + ext = obj->externs; + ext = reallocarray(ext, obj->nr_extern + 1, sizeof(*ext)); + if (!ext) + return -ENOMEM; + obj->externs = ext; + ext = &ext[obj->nr_extern]; + memset(ext, 0, sizeof(*ext)); + obj->nr_extern++; + + ext->btf_id = find_extern_btf_id(obj->btf, ext_name); + if (ext->btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s': %d\n", + ext_name, ext->btf_id); + return ext->btf_id; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + ext->name = btf__name_by_offset(obj->btf, t->name_off); + ext->sym_idx = i; + ext->is_weak = GELF_ST_BIND(sym.st_info) == STB_WEAK; + ext->sz = btf__resolve_size(obj->btf, t->type); + if (ext->sz <= 0) { + pr_warn("failed to resolve size of extern '%s': %d\n", + ext_name, ext->sz); + return ext->sz; + } + ext->align = btf__align_of(obj->btf, t->type); + if (ext->align <= 0) { + pr_warn("failed to determine alignment of extern '%s': %d\n", + ext_name, ext->align); + return -EINVAL; + } + ext->type = find_extern_type(obj->btf, t->type, + &ext->is_signed); + if (ext->type == EXT_UNKNOWN) { + pr_warn("extern '%s' type is unsupported\n", ext_name); + return -ENOTSUP; + } + } + pr_debug("collected %d externs total\n", obj->nr_extern); + + if (!obj->nr_extern) + return 0; + + /* sort externs by (alignment, size, name) and calculate their offsets + * within a map */ + qsort(obj->externs, obj->nr_extern, sizeof(*ext), cmp_externs); + off = 0; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + ext->data_off = roundup(off, ext->align); + off = ext->data_off + ext->sz; + pr_debug("extern #%d: symbol %d, off %u, name %s\n", + i, ext->sym_idx, ext->data_off, ext->name); + } + + btf_id = btf__find_by_name(obj->btf, KCONFIG_SEC); + if (btf_id <= 0) { + pr_warn("no BTF info found for '%s' datasec\n", KCONFIG_SEC); + return -ESRCH; + } + + sec = (struct btf_type *)btf__type_by_id(obj->btf, btf_id); + sec->size = off; + n = btf_vlen(sec); + for (i = 0; i < n; i++) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + + t = btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, t->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF var '%s'\n", + ext_name); + return -ESRCH; + } + vs->offset = ext->data_off; + btf_var(t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + } + + return 0; } static struct bpf_program * @@ -1765,6 +2368,19 @@ bpf_object__find_program_by_title(const struct bpf_object *obj, return NULL; } +struct bpf_program * 
+bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name) +{ + struct bpf_program *prog; + + bpf_object__for_each_program(prog, obj) { + if (!strcmp(prog->name, name)) + return prog; + } + return NULL; +} + static bool bpf_object__shndx_is_data(const struct bpf_object *obj, int shndx) { @@ -1789,6 +2405,8 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) return LIBBPF_MAP_BSS; else if (shndx == obj->efile.rodata_shndx) return LIBBPF_MAP_RODATA; + else if (shndx == obj->efile.symbols_shndx) + return LIBBPF_MAP_KCONFIG; else return LIBBPF_MAP_UNSPEC; } @@ -1817,7 +2435,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } if (sym->st_value % 8) { - pr_warn("bad call relo offset: %llu\n", (__u64)sym->st_value); + pr_warn("bad call relo offset: %zu\n", + (size_t)sym->st_value); return -LIBBPF_ERRNO__RELOC; } reloc_desc->type = RELO_CALL; @@ -1832,6 +2451,30 @@ static int bpf_program__record_reloc(struct bpf_program *prog, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; } + + if (sym_is_extern(sym)) { + int sym_idx = GELF_R_SYM(rel->r_info); + int i, n = obj->nr_extern; + struct extern_desc *ext; + + for (i = 0; i < n; i++) { + ext = &obj->externs[i]; + if (ext->sym_idx == sym_idx) + break; + } + if (i >= n) { + pr_warn("extern relo failed to find extern for sym %d\n", + sym_idx); + return -LIBBPF_ERRNO__RELOC; + } + pr_debug("found extern #%d '%s' (sym %d, off %u) for insn %u\n", + i, ext->name, ext->sym_idx, ext->data_off, insn_idx); + reloc_desc->type = RELO_EXTERN; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = ext->data_off; + return 0; + } + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { pr_warn("invalid relo for \'%s\' in special section 0x%x; forgot to initialize global var?..\n", name, shdr_idx); @@ -1859,8 +2502,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, break; } if (map_idx >= nr_maps) { - pr_warn("map relo failed to find map for sec %u, off %llu\n", - shdr_idx, (__u64)sym->st_value); + pr_warn("map relo failed to find map for sec %u, off %zu\n", + shdr_idx, (size_t)sym->st_value); return -LIBBPF_ERRNO__RELOC; } reloc_desc->type = RELO_LD64; @@ -1875,11 +2518,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, pr_warn("bad data relo against section %u\n", shdr_idx); return -LIBBPF_ERRNO__RELOC; } - if (!obj->caps.global_data) { - pr_warn("relocation: kernel does not support global \'%s\' variable access in insns[%d]\n", - name, insn_idx); - return -LIBBPF_ERRNO__RELOC; - } for (map_idx = 0; map_idx < nr_maps; map_idx++) { map = &obj->maps[map_idx]; if (map->libbpf_type != type) @@ -1941,9 +2579,9 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, sym.st_name) ? 
: "<?>"; - pr_debug("relo for shdr %u, symb %llu, value %llu, type %d, bind %d, name %d (\'%s\'), insn %u\n", - (__u32)sym.st_shndx, (__u64)GELF_R_SYM(rel.r_info), - (__u64)sym.st_value, GELF_ST_TYPE(sym.st_info), + pr_debug("relo for shdr %u, symb %zu, value %zu, type %d, bind %d, name %d (\'%s\'), insn %u\n", + (__u32)sym.st_shndx, (size_t)GELF_R_SYM(rel.r_info), + (size_t)sym.st_value, GELF_ST_TYPE(sym.st_info), GELF_ST_BIND(sym.st_info), sym.st_name, name, insn_idx); @@ -2298,29 +2936,35 @@ bpf_object__reuse_map(struct bpf_map *map) static int bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) { + enum libbpf_map_type map_type = map->libbpf_type; char *cp, errmsg[STRERR_BUFSIZE]; int err, zero = 0; - __u8 *data; - /* Nothing to do here since kernel already zero-initializes .bss map. */ - if (map->libbpf_type == LIBBPF_MAP_BSS) + /* kernel already zero-initializes .bss map. */ + if (map_type == LIBBPF_MAP_BSS) return 0; - data = map->libbpf_type == LIBBPF_MAP_DATA ? - obj->sections.data : obj->sections.rodata; + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error setting initial map(%s) contents: %s\n", + map->name, cp); + return err; + } - err = bpf_map_update_elem(map->fd, &zero, data, 0); - /* Freeze .rodata map as read-only from syscall side. */ - if (!err && map->libbpf_type == LIBBPF_MAP_RODATA) { + /* Freeze .rodata and .kconfig map as read-only from syscall side. */ + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) { err = bpf_map_freeze(map->fd); if (err) { - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warn("Error freezing map(%s) as read-only: %s\n", map->name, cp); - err = 0; + return err; } } - return err; + return 0; } static int @@ -2411,6 +3055,7 @@ err_out: cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warn("failed to create map (name: '%s'): %s(%d)\n", map->name, cp, err); + pr_perm_msg(err); for (j = 0; j < i; j++) zclose(obj->maps[j].fd); return err; @@ -2536,6 +3181,21 @@ static bool str_is_empty(const char *s) return !s || !s[0]; } +static bool is_flex_arr(const struct btf *btf, + const struct bpf_core_accessor *acc, + const struct btf_array *arr) +{ + const struct btf_type *t; + + /* not a flexible array, if not inside a struct or has non-zero size */ + if (!acc->name || arr->nelems > 0) + return false; + + /* has to be the last member of enclosing struct */ + t = btf__type_by_id(btf, acc->type_id); + return acc->idx == btf_vlen(t) - 1; +} + /* * Turn bpf_field_reloc into a low- and high-level spec representation, * validating correctness along the way, as well as calculating resulting @@ -2573,6 +3233,7 @@ static int bpf_core_spec_parse(const struct btf *btf, struct bpf_core_spec *spec) { int access_idx, parsed_len, i; + struct bpf_core_accessor *acc; const struct btf_type *t; const char *name; __u32 id; @@ -2620,6 +3281,7 @@ static int bpf_core_spec_parse(const struct btf *btf, return -EINVAL; access_idx = spec->raw_spec[i]; + acc = &spec->spec[spec->len]; if (btf_is_composite(t)) { const struct btf_member *m; @@ -2637,18 +3299,23 @@ static int bpf_core_spec_parse(const struct btf *btf, if (str_is_empty(name)) return -EINVAL; - spec->spec[spec->len].type_id = id; - spec->spec[spec->len].idx = access_idx; - spec->spec[spec->len].name = name; + acc->type_id = id; + acc->idx = access_idx; + acc->name = name; 
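Since is_flex_arr() relaxes the array bound check applied below, a hedged sketch (not from the patch) of the one layout it accepts:

struct event {
    int seq;
    int cnt;
    char payload[]; /* zero-sized (nelems == 0), named, last member of the struct */
};

/* An access spec ending in payload[idx] may use any idx, because a
 * flexible array's real extent is only known at runtime; the same index
 * into a fixed-size array placed mid-struct still fails the nelems check.
 */
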
spec->len++; } id = m->type; } else if (btf_is_array(t)) { const struct btf_array *a = btf_array(t); + bool flex; t = skip_mods_and_typedefs(btf, a->type, &id); - if (!t || access_idx >= a->nelems) + if (!t) + return -EINVAL; + + flex = is_flex_arr(btf, acc - 1, a); + if (!flex && access_idx >= a->nelems) return -EINVAL; spec->spec[spec->len].type_id = id; @@ -2953,12 +3620,14 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, */ if (i > 0) { const struct btf_array *a; + bool flex; if (!btf_is_array(targ_type)) return 0; a = btf_array(targ_type); - if (local_acc->idx >= a->nelems) + flex = is_flex_arr(targ_btf, targ_acc - 1, a); + if (!flex && local_acc->idx >= a->nelems) return 0; if (!skip_mods_and_typedefs(targ_btf, a->type, &targ_id)) @@ -3142,11 +3811,13 @@ static int bpf_core_reloc_insn(struct bpf_program *prog, insn = &prog->insns[insn_idx]; class = BPF_CLASS(insn->code); - if (class == BPF_ALU || class == BPF_ALU64) { + switch (class) { + case BPF_ALU: + case BPF_ALU64: if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; if (!failed && validate && insn->imm != orig_val) { - pr_warn("prog '%s': unexpected insn #%d value: got %u, exp %u -> %u\n", + pr_warn("prog '%s': unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", bpf_program__title(prog, false), insn_idx, insn->imm, orig_val, new_val); return -EINVAL; @@ -3156,7 +3827,29 @@ static int bpf_core_reloc_insn(struct bpf_program *prog, pr_debug("prog '%s': patched insn #%d (ALU/ALU64)%s imm %u -> %u\n", bpf_program__title(prog, false), insn_idx, failed ? " w/ failed reloc" : "", orig_val, new_val); - } else { + break; + case BPF_LDX: + case BPF_ST: + case BPF_STX: + if (!failed && validate && insn->off != orig_val) { + pr_warn("prog '%s': unexpected insn #%d (LD/LDX/ST/STX) value: got %u, exp %u -> %u\n", + bpf_program__title(prog, false), insn_idx, + insn->off, orig_val, new_val); + return -EINVAL; + } + if (new_val > SHRT_MAX) { + pr_warn("prog '%s': insn #%d (LD/LDX/ST/STX) value too big: %u\n", + bpf_program__title(prog, false), insn_idx, + new_val); + return -ERANGE; + } + orig_val = insn->off; + insn->off = new_val; + pr_debug("prog '%s': patched insn #%d (LD/LDX/ST/STX)%s off %u -> %u\n", + bpf_program__title(prog, false), insn_idx, + failed ? 
" w/ failed reloc" : "", orig_val, new_val); + break; + default: pr_warn("prog '%s': trying to relocate unrecognized insn #%d, code:%x, src:%x, dst:%x, off:%x, imm:%x\n", bpf_program__title(prog, false), insn_idx, insn->code, insn->src_reg, insn->dst_reg, @@ -3559,9 +4252,6 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, size_t new_cnt; int err; - if (relo->type != RELO_CALL) - return -LIBBPF_ERRNO__RELOC; - if (prog->idx == obj->efile.text_shndx) { pr_warn("relo in .text insn %d into off %d (insn #%d)\n", relo->insn_idx, relo->sym_off, relo->sym_off / 8); @@ -3623,27 +4313,37 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) for (i = 0; i < prog->nr_reloc; i++) { struct reloc_desc *relo = &prog->reloc_desc[i]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; - if (relo->type == RELO_LD64 || relo->type == RELO_DATA) { - struct bpf_insn *insn = &prog->insns[relo->insn_idx]; - - if (relo->insn_idx + 1 >= (int)prog->insns_cnt) { - pr_warn("relocation out of range: '%s'\n", - prog->section_name); - return -LIBBPF_ERRNO__RELOC; - } + if (relo->insn_idx + 1 >= (int)prog->insns_cnt) { + pr_warn("relocation out of range: '%s'\n", + prog->section_name); + return -LIBBPF_ERRNO__RELOC; + } - if (relo->type != RELO_DATA) { - insn[0].src_reg = BPF_PSEUDO_MAP_FD; - } else { - insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; - insn[1].imm = insn[0].imm + relo->sym_off; - } + switch (relo->type) { + case RELO_LD64: + insn[0].src_reg = BPF_PSEUDO_MAP_FD; + insn[0].imm = obj->maps[relo->map_idx].fd; + break; + case RELO_DATA: + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[1].imm = insn[0].imm + relo->sym_off; insn[0].imm = obj->maps[relo->map_idx].fd; - } else if (relo->type == RELO_CALL) { + break; + case RELO_EXTERN: + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = obj->maps[obj->kconfig_map_idx].fd; + insn[1].imm = relo->sym_off; + break; + case RELO_CALL: err = bpf_program__reloc_text(prog, obj, relo); if (err) return err; + break; + default: + pr_warn("relo #%d: bad relo type %d\n", i, relo->type); + return -EINVAL; } } @@ -3778,6 +4478,7 @@ retry_load: ret = -errno; cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warn("load bpf program failed: %s\n", cp); + pr_perm_msg(ret); if (log_buf && log_buf[0] != '\0') { ret = -LIBBPF_ERRNO__VERIFY; @@ -3807,11 +4508,22 @@ out: return ret; } -int -bpf_program__load(struct bpf_program *prog, - char *license, __u32 kern_version) +static int libbpf_find_attach_btf_id(const char *name, + enum bpf_attach_type attach_type, + __u32 attach_prog_fd); + +int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) { - int err = 0, fd, i; + int err = 0, fd, i, btf_id; + + if (prog->type == BPF_PROG_TYPE_TRACING) { + btf_id = libbpf_find_attach_btf_id(prog->section_name, + prog->expected_attach_type, + prog->attach_prog_fd); + if (btf_id <= 0) + return btf_id; + prog->attach_btf_id = btf_id; + } if (prog->instances.nr < 0 || !prog->instances.fds) { if (prog->preprocessor) { @@ -3835,7 +4547,7 @@ bpf_program__load(struct bpf_program *prog, prog->section_name, prog->instances.nr); } err = load_program(prog, prog->insns, prog->insns_cnt, - license, kern_version, &fd); + license, kern_ver, &fd); if (!err) prog->instances.fds[0] = fd; goto out; @@ -3864,9 +4576,7 @@ bpf_program__load(struct bpf_program *prog, } err = load_program(prog, result.new_insn_ptr, - result.new_insn_cnt, - license, kern_version, &fd); - + result.new_insn_cnt, license, kern_ver, &fd); if (err) { pr_warn("Loading 
the %dth instance of program '%s' failed\n", i, prog->section_name); @@ -3910,20 +4620,14 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) return 0; } -static int libbpf_find_attach_btf_id(const char *name, - enum bpf_attach_type attach_type, - __u32 attach_prog_fd); static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, - struct bpf_object_open_opts *opts) + const struct bpf_object_open_opts *opts) { - const char *pin_root_path; + const char *obj_name, *kconfig; struct bpf_program *prog; struct bpf_object *obj; - const char *obj_name; char tmp_name[64]; - bool relaxed_maps; - __u32 attach_prog_fd; int err; if (elf_version(EV_CURRENT) == EV_NONE) { @@ -3952,16 +4656,23 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, return obj; obj->relaxed_core_relocs = OPTS_GET(opts, relaxed_core_relocs, false); - relaxed_maps = OPTS_GET(opts, relaxed_maps, false); - pin_root_path = OPTS_GET(opts, pin_root_path, NULL); - attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); - - CHECK_ERR(bpf_object__elf_init(obj), err, out); - CHECK_ERR(bpf_object__check_endianness(obj), err, out); - CHECK_ERR(bpf_object__probe_caps(obj), err, out); - CHECK_ERR(bpf_object__elf_collect(obj, relaxed_maps, pin_root_path), - err, out); - CHECK_ERR(bpf_object__collect_reloc(obj), err, out); + kconfig = OPTS_GET(opts, kconfig, NULL); + if (kconfig) { + obj->kconfig = strdup(kconfig); + if (!obj->kconfig) + return ERR_PTR(-ENOMEM); + } + + err = bpf_object__elf_init(obj); + err = err ? : bpf_object__check_endianness(obj); + err = err ? : bpf_object__elf_collect(obj); + err = err ? : bpf_object__collect_externs(obj); + err = err ? : bpf_object__finalize_btf(obj); + err = err ? : bpf_object__init_maps(obj, opts); + err = err ? : bpf_object__init_prog_names(obj); + err = err ? 
: bpf_object__collect_reloc(obj); + if (err) + goto out; bpf_object__elf_finish(obj); bpf_object__for_each_program(prog, obj) { @@ -3978,15 +4689,8 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, bpf_program__set_type(prog, prog_type); bpf_program__set_expected_attach_type(prog, attach_type); - if (prog_type == BPF_PROG_TYPE_TRACING) { - err = libbpf_find_attach_btf_id(prog->section_name, - attach_type, - attach_prog_fd); - if (err <= 0) - goto out; - prog->attach_btf_id = err; - prog->attach_prog_fd = attach_prog_fd; - } + if (prog_type == BPF_PROG_TYPE_TRACING) + prog->attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); } return obj; @@ -4026,7 +4730,7 @@ struct bpf_object *bpf_object__open(const char *path) } struct bpf_object * -bpf_object__open_file(const char *path, struct bpf_object_open_opts *opts) +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) { if (!path) return ERR_PTR(-EINVAL); @@ -4038,7 +4742,7 @@ bpf_object__open_file(const char *path, struct bpf_object_open_opts *opts) struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, - struct bpf_object_open_opts *opts) + const struct bpf_object_open_opts *opts) { if (!obj_buf || obj_buf_sz == 0) return ERR_PTR(-EINVAL); @@ -4079,6 +4783,92 @@ int bpf_object__unload(struct bpf_object *obj) return 0; } +static int bpf_object__sanitize_maps(struct bpf_object *obj) +{ + struct bpf_map *m; + + bpf_object__for_each_map(m, obj) { + if (!bpf_map__is_internal(m)) + continue; + if (!obj->caps.global_data) { + pr_warn("kernel doesn't support global data\n"); + return -ENOTSUP; + } + if (!obj->caps.array_mmap) + m->def.map_flags ^= BPF_F_MMAPABLE; + } + + return 0; +} + +static int bpf_object__resolve_externs(struct bpf_object *obj, + const char *extra_kconfig) +{ + bool need_config = false; + struct extern_desc *ext; + int err, i; + void *data; + + if (obj->nr_extern == 0) + return 0; + + data = obj->maps[obj->kconfig_map_idx].mmaped; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { + void *ext_val = data + ext->data_off; + __u32 kver = get_kernel_version(); + + if (!kver) { + pr_warn("failed to get kernel version\n"); + return -EINVAL; + } + err = set_ext_value_num(ext, ext_val, kver); + if (err) + return err; + pr_debug("extern %s=0x%x\n", ext->name, kver); + } else if (strncmp(ext->name, "CONFIG_", 7) == 0) { + need_config = true; + } else { + pr_warn("unrecognized extern '%s'\n", ext->name); + return -EINVAL; + } + } + if (need_config && extra_kconfig) { + err = bpf_object__read_kconfig_mem(obj, extra_kconfig, data); + if (err) + return -EINVAL; + need_config = false; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (!ext->is_set) { + need_config = true; + break; + } + } + } + if (need_config) { + err = bpf_object__read_kconfig_file(obj, data); + if (err) + return -EINVAL; + } + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (!ext->is_set && !ext->is_weak) { + pr_warn("extern %s (strong) not resolved\n", ext->name); + return -ESRCH; + } else if (!ext->is_set) { + pr_debug("extern %s (weak) not resolved, defaulting to zero\n", + ext->name); + } + } + + return 0; +} + int bpf_object__load_xattr(struct bpf_object_load_attr *attr) { struct bpf_object *obj; @@ -4097,9 +4887,15 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) obj->loaded = true; - CHECK_ERR(bpf_object__create_maps(obj), err, out); - 
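For orientation, a hedged sketch (not part of the patch) of the BPF-program side that bpf_object__resolve_externs() serves; the two extern names follow the only cases the function recognizes, and the enum mirrors the libbpf_tristate definition added to libbpf.h later in this diff:

#include <linux/version.h>

enum libbpf_tristate { TRI_NO = 0, TRI_YES = 1, TRI_MODULE = 2 };

extern unsigned int LINUX_KERNEL_VERSION;       /* resolved via get_kernel_version() */
extern enum libbpf_tristate CONFIG_BPF_SYSCALL; /* resolved from system/extra Kconfig */

int prog(void *ctx)
{
    /* both externs are laid out in the read-only kconfig map at load time */
    if (LINUX_KERNEL_VERSION < KERNEL_VERSION(5, 2, 0))
        return 0;
    return CONFIG_BPF_SYSCALL == TRI_YES;
}
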
CHECK_ERR(bpf_object__relocate(obj, attr->target_btf_path), err, out); - CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out); + err = bpf_object__probe_caps(obj); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object__relocate(obj, attr->target_btf_path); + err = err ? : bpf_object__load_progs(obj, attr->log_level); + if (err) + goto out; return 0; out: @@ -4670,17 +5466,26 @@ void bpf_object__close(struct bpf_object *obj) btf_ext__free(obj->btf_ext); for (i = 0; i < obj->nr_maps; i++) { - zfree(&obj->maps[i].name); - zfree(&obj->maps[i].pin_path); - if (obj->maps[i].clear_priv) - obj->maps[i].clear_priv(&obj->maps[i], - obj->maps[i].priv); - obj->maps[i].priv = NULL; - obj->maps[i].clear_priv = NULL; + struct bpf_map *map = &obj->maps[i]; + + if (map->clear_priv) + map->clear_priv(map, map->priv); + map->priv = NULL; + map->clear_priv = NULL; + + if (map->mmaped) { + munmap(map->mmaped, bpf_map_mmap_sz(map)); + map->mmaped = NULL; + } + + zfree(&map->name); + zfree(&map->pin_path); } - zfree(&obj->sections.rodata); - zfree(&obj->sections.data); + zfree(&obj->kconfig); + zfree(&obj->externs); + obj->nr_extern = 0; + zfree(&obj->maps); obj->nr_maps = 0; @@ -4820,6 +5625,11 @@ void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) prog->prog_ifindex = ifindex; } +const char *bpf_program__name(const struct bpf_program *prog) +{ + return prog->name; +} + const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) { const char *title; @@ -4972,7 +5782,28 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog, */ #define BPF_APROG_COMPAT(string, ptype) BPF_PROG_SEC(string, ptype) -static const struct { +#define SEC_DEF(sec_pfx, ptype, ...) 
{ \ + .sec = sec_pfx, \ + .len = sizeof(sec_pfx) - 1, \ + .prog_type = BPF_PROG_TYPE_##ptype, \ + __VA_ARGS__ \ +} + +struct bpf_sec_def; + +typedef struct bpf_link *(*attach_fn_t)(const struct bpf_sec_def *sec, + struct bpf_program *prog); + +static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, + struct bpf_program *prog); +static struct bpf_link *attach_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog); +static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog); +static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, + struct bpf_program *prog); + +struct bpf_sec_def { const char *sec; size_t len; enum bpf_prog_type prog_type; @@ -4980,24 +5811,40 @@ static const struct { bool is_attachable; bool is_attach_btf; enum bpf_attach_type attach_type; -} section_names[] = { + attach_fn_t attach_fn; +}; + +static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), - BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), + BPF_PROG_SEC("sk_reuseport", BPF_PROG_TYPE_SK_REUSEPORT), + SEC_DEF("kprobe/", KPROBE, + .attach_fn = attach_kprobe), BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE), - BPF_PROG_SEC("kretprobe/", BPF_PROG_TYPE_KPROBE), + SEC_DEF("kretprobe/", KPROBE, + .attach_fn = attach_kprobe), BPF_PROG_SEC("uretprobe/", BPF_PROG_TYPE_KPROBE), BPF_PROG_SEC("classifier", BPF_PROG_TYPE_SCHED_CLS), BPF_PROG_SEC("action", BPF_PROG_TYPE_SCHED_ACT), - BPF_PROG_SEC("tracepoint/", BPF_PROG_TYPE_TRACEPOINT), - BPF_PROG_SEC("tp/", BPF_PROG_TYPE_TRACEPOINT), - BPF_PROG_SEC("raw_tracepoint/", BPF_PROG_TYPE_RAW_TRACEPOINT), - BPF_PROG_SEC("raw_tp/", BPF_PROG_TYPE_RAW_TRACEPOINT), - BPF_PROG_BTF("tp_btf/", BPF_PROG_TYPE_TRACING, - BPF_TRACE_RAW_TP), - BPF_PROG_BTF("fentry/", BPF_PROG_TYPE_TRACING, - BPF_TRACE_FENTRY), - BPF_PROG_BTF("fexit/", BPF_PROG_TYPE_TRACING, - BPF_TRACE_FEXIT), + SEC_DEF("tracepoint/", TRACEPOINT, + .attach_fn = attach_tp), + SEC_DEF("tp/", TRACEPOINT, + .attach_fn = attach_tp), + SEC_DEF("raw_tracepoint/", RAW_TRACEPOINT, + .attach_fn = attach_raw_tp), + SEC_DEF("raw_tp/", RAW_TRACEPOINT, + .attach_fn = attach_raw_tp), + SEC_DEF("tp_btf/", TRACING, + .expected_attach_type = BPF_TRACE_RAW_TP, + .is_attach_btf = true, + .attach_fn = attach_trace), + SEC_DEF("fentry/", TRACING, + .expected_attach_type = BPF_TRACE_FENTRY, + .is_attach_btf = true, + .attach_fn = attach_trace), + SEC_DEF("fexit/", TRACING, + .expected_attach_type = BPF_TRACE_FEXIT, + .is_attach_btf = true, + .attach_fn = attach_trace), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), @@ -5059,12 +5906,26 @@ static const struct { #undef BPF_APROG_SEC #undef BPF_EAPROG_SEC #undef BPF_APROG_COMPAT +#undef SEC_DEF #define MAX_TYPE_NAME_SIZE 32 +static const struct bpf_sec_def *find_sec_def(const char *sec_name) +{ + int i, n = ARRAY_SIZE(section_defs); + + for (i = 0; i < n; i++) { + if (strncmp(sec_name, + section_defs[i].sec, section_defs[i].len)) + continue; + return &section_defs[i]; + } + return NULL; +} + static char *libbpf_get_type_names(bool attach_type) { - int i, len = ARRAY_SIZE(section_names) * MAX_TYPE_NAME_SIZE; + int i, len = ARRAY_SIZE(section_defs) * MAX_TYPE_NAME_SIZE; char *buf; buf = malloc(len); @@ -5073,16 +5934,16 @@ static char *libbpf_get_type_names(bool attach_type) buf[0] = '\0'; /* Forge string buf with all available names */ - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (attach_type &&
!section_names[i].is_attachable) + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + if (attach_type && !section_defs[i].is_attachable) continue; - if (strlen(buf) + strlen(section_names[i].sec) + 2 > len) { + if (strlen(buf) + strlen(section_defs[i].sec) + 2 > len) { free(buf); return NULL; } strcat(buf, " "); - strcat(buf, section_names[i].sec); + strcat(buf, section_defs[i].sec); } return buf; @@ -5091,23 +5952,23 @@ static char *libbpf_get_type_names(bool attach_type) int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, enum bpf_attach_type *expected_attach_type) { + const struct bpf_sec_def *sec_def; char *type_names; - int i; if (!name) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (strncmp(name, section_names[i].sec, section_names[i].len)) - continue; - *prog_type = section_names[i].prog_type; - *expected_attach_type = section_names[i].expected_attach_type; + sec_def = find_sec_def(name); + if (sec_def) { + *prog_type = sec_def->prog_type; + *expected_attach_type = sec_def->expected_attach_type; return 0; } - pr_warn("failed to guess program type from ELF section '%s'\n", name); + + pr_debug("failed to guess program type from ELF section '%s'\n", name); type_names = libbpf_get_type_names(false); if (type_names != NULL) { - pr_info("supported section(type) names are:%s\n", type_names); + pr_debug("supported section(type) names are:%s\n", type_names); free(type_names); } @@ -5186,16 +6047,16 @@ static int libbpf_find_attach_btf_id(const char *name, if (!name) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (!section_names[i].is_attach_btf) + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + if (!section_defs[i].is_attach_btf) continue; - if (strncmp(name, section_names[i].sec, section_names[i].len)) + if (strncmp(name, section_defs[i].sec, section_defs[i].len)) continue; if (attach_prog_fd) - err = libbpf_find_prog_btf_id(name + section_names[i].len, + err = libbpf_find_prog_btf_id(name + section_defs[i].len, attach_prog_fd); else - err = libbpf_find_vmlinux_btf_id(name + section_names[i].len, + err = libbpf_find_vmlinux_btf_id(name + section_defs[i].len, attach_type); if (err <= 0) pr_warn("%s is not found in vmlinux BTF\n", name); @@ -5214,18 +6075,18 @@ int libbpf_attach_type_by_name(const char *name, if (!name) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(section_names); i++) { - if (strncmp(name, section_names[i].sec, section_names[i].len)) + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + if (strncmp(name, section_defs[i].sec, section_defs[i].len)) continue; - if (!section_names[i].is_attachable) + if (!section_defs[i].is_attachable) return -EINVAL; - *attach_type = section_names[i].attach_type; + *attach_type = section_defs[i].attach_type; return 0; } - pr_warn("failed to guess attach type based on ELF section name '%s'\n", name); + pr_debug("failed to guess attach type based on ELF section name '%s'\n", name); type_names = libbpf_get_type_names(true); if (type_names != NULL) { - pr_info("attachable section(type) names are:%s\n", type_names); + pr_debug("attachable section(type) names are:%s\n", type_names); free(type_names); } @@ -5466,17 +6327,37 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, } struct bpf_link { + int (*detach)(struct bpf_link *link); int (*destroy)(struct bpf_link *link); + bool disconnected; }; +/* Release "ownership" of underlying BPF resource (typically, BPF program + * attached to some BPF hook, e.g., tracepoint, kprobe, etc). 
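A hedged usage sketch of the disconnect semantics being introduced here (the helper is hypothetical, not from the patch):

/* keep the BPF program attached even after this process exits */
static int detach_or_persist(struct bpf_link *link, bool persist)
{
    if (persist)
        bpf_link__disconnect(link); /* destroy() will then skip detach */
    return bpf_link__destroy(link); /* frees the link memory either way */
}
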
A disconnected + * link, when destroyed through the bpf_link__destroy() call, won't attempt to + * detach/unregister that BPF resource. This is useful in situations where, + * say, attached BPF program has to outlive userspace program that attached it + * in the system. Depending on type of BPF program, though, there might be + * additional steps (like pinning BPF program in BPF FS) necessary to ensure + * exit of userspace program doesn't trigger automatic detachment and clean up + * inside the kernel. + */ +void bpf_link__disconnect(struct bpf_link *link) +{ + link->disconnected = true; +} + int bpf_link__destroy(struct bpf_link *link) { - int err; + int err = 0; if (!link) return 0; - err = link->destroy(link); + if (!link->disconnected && link->detach) + err = link->detach(link); + if (link->destroy) + link->destroy(link); free(link); return err; @@ -5487,7 +6368,7 @@ struct bpf_link_fd { int fd; /* hook FD */ }; -static int bpf_link__destroy_perf_event(struct bpf_link *link) +static int bpf_link__detach_perf_event(struct bpf_link *link) { struct bpf_link_fd *l = (void *)link; int err; @@ -5519,10 +6400,10 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, return ERR_PTR(-EINVAL); } - link = malloc(sizeof(*link)); + link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.destroy = &bpf_link__destroy_perf_event; + link->link.detach = &bpf_link__detach_perf_event; link->fd = pfd; if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { @@ -5679,6 +6560,18 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, return link; } +static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + const char *func_name; + bool retprobe; + + func_name = bpf_program__title(prog, false) + sec->len; + retprobe = strcmp(sec->sec, "kretprobe/") == 0; + + return bpf_program__attach_kprobe(prog, retprobe, func_name); +} + struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, bool retprobe, pid_t pid, const char *binary_path, @@ -5791,7 +6684,33 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, return link; } -static int bpf_link__destroy_fd(struct bpf_link *link) +static struct bpf_link *attach_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + char *sec_name, *tp_cat, *tp_name; + struct bpf_link *link; + + sec_name = strdup(bpf_program__title(prog, false)); + if (!sec_name) + return ERR_PTR(-ENOMEM); + + /* extract "tp/<category>/<name>" */ + tp_cat = sec_name + sec->len; + tp_name = strchr(tp_cat, '/'); + if (!tp_name) { + link = ERR_PTR(-EINVAL); + goto out; + } + *tp_name = '\0'; + tp_name++; + + link = bpf_program__attach_tracepoint(prog, tp_cat, tp_name); +out: + free(sec_name); + return link; +} + +static int bpf_link__detach_fd(struct bpf_link *link) { struct bpf_link_fd *l = (void *)link; @@ -5812,10 +6731,10 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, return ERR_PTR(-EINVAL); } - link = malloc(sizeof(*link)); + link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.destroy = &bpf_link__destroy_fd; + link->link.detach = &bpf_link__detach_fd; pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); if (pfd < 0) { @@ -5830,6 +6749,14 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, return (struct bpf_link *)link; } +static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + const char *tp_name =
bpf_program__title(prog, false) + sec->len; + + return bpf_program__attach_raw_tracepoint(prog, tp_name); +} + struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) { char errmsg[STRERR_BUFSIZE]; @@ -5843,10 +6770,10 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return ERR_PTR(-EINVAL); } - link = malloc(sizeof(*link)); + link = calloc(1, sizeof(*link)); if (!link) return ERR_PTR(-ENOMEM); - link->link.destroy = &bpf_link__destroy_fd; + link->link.detach = &bpf_link__detach_fd; pfd = bpf_raw_tracepoint_open(NULL, prog_fd); if (pfd < 0) { @@ -5861,6 +6788,23 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return (struct bpf_link *)link; } +static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + return bpf_program__attach_trace(prog); +} + +struct bpf_link *bpf_program__attach(struct bpf_program *prog) +{ + const struct bpf_sec_def *sec_def; + + sec_def = find_sec_def(bpf_program__title(prog, false)); + if (!sec_def || !sec_def->attach_fn) + return ERR_PTR(-ESRCH); + + return sec_def->attach_fn(sec_def, prog); +} + enum bpf_perf_event_ret bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, @@ -5944,7 +6888,7 @@ struct perf_buffer { size_t mmap_size; struct perf_cpu_buf **cpu_bufs; struct epoll_event *events; - int cpu_cnt; + int cpu_cnt; /* number of allocated CPU buffers */ int epoll_fd; /* perf event FD */ int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ }; @@ -6078,11 +7022,13 @@ perf_buffer__new_raw(int map_fd, size_t page_cnt, static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p) { + const char *online_cpus_file = "/sys/devices/system/cpu/online"; struct bpf_map_info map = {}; char msg[STRERR_BUFSIZE]; struct perf_buffer *pb; + bool *online = NULL; __u32 map_info_len; - int err, i; + int err, i, j, n; if (page_cnt & (page_cnt - 1)) { pr_warn("page count should be power of two, but is %zu\n", @@ -6151,20 +7097,32 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, goto error; } - for (i = 0; i < pb->cpu_cnt; i++) { + err = parse_cpu_mask_file(online_cpus_file, &online, &n); + if (err) { + pr_warn("failed to get online CPU mask: %d\n", err); + goto error; + } + + for (i = 0, j = 0; i < pb->cpu_cnt; i++) { struct perf_cpu_buf *cpu_buf; int cpu, map_key; cpu = p->cpu_cnt > 0 ? p->cpus[i] : i; map_key = p->cpu_cnt > 0 ? 
p->map_keys[i] : i; + /* in case the user didn't explicitly request particular CPUs to + * be attached to, skip offline/not present CPUs + */ + if (p->cpu_cnt <= 0 && (cpu >= n || !online[cpu])) + continue; + cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key); if (IS_ERR(cpu_buf)) { err = PTR_ERR(cpu_buf); goto error; } - pb->cpu_bufs[i] = cpu_buf; + pb->cpu_bufs[j] = cpu_buf; err = bpf_map_update_elem(pb->map_fd, &map_key, &cpu_buf->fd, 0); @@ -6176,21 +7134,25 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, goto error; } - pb->events[i].events = EPOLLIN; - pb->events[i].data.ptr = cpu_buf; + pb->events[j].events = EPOLLIN; + pb->events[j].data.ptr = cpu_buf; if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd, - &pb->events[i]) < 0) { + &pb->events[j]) < 0) { err = -errno; pr_warn("failed to epoll_ctl cpu #%d perf FD %d: %s\n", cpu, cpu_buf->fd, libbpf_strerror_r(err, msg, sizeof(msg))); goto error; } + j++; } + pb->cpu_cnt = j; + free(online); return pb; error: + free(online); if (pb) perf_buffer__free(pb); return ERR_PTR(err); @@ -6521,62 +7483,267 @@ void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) } } -int libbpf_num_possible_cpus(void) +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) { - static const char *fcpu = "/sys/devices/system/cpu/possible"; - int len = 0, n = 0, il = 0, ir = 0; - unsigned int start = 0, end = 0; - int tmp_cpus = 0; - static int cpus; - char buf[128]; - int error = 0; - int fd = -1; + int err = 0, n, len, start, end = -1; + bool *tmp; - tmp_cpus = READ_ONCE(cpus); - if (tmp_cpus > 0) - return tmp_cpus; + *mask = NULL; + *mask_sz = 0; + + /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ + while (*s) { + if (*s == ',' || *s == '\n') { + s++; + continue; + } + n = sscanf(s, "%d%n-%d%n", &start, &len, &end, &len); + if (n <= 0 || n > 2) { + pr_warn("Failed to get CPU range %s: %d\n", s, n); + err = -EINVAL; + goto cleanup; + } else if (n == 1) { + end = start; + } + if (start < 0 || start > end) { + pr_warn("Invalid CPU range [%d,%d] in %s\n", + start, end, s); + err = -EINVAL; + goto cleanup; + } + tmp = realloc(*mask, end + 1); + if (!tmp) { + err = -ENOMEM; + goto cleanup; + } + *mask = tmp; + memset(tmp + *mask_sz, 0, start - *mask_sz); + memset(tmp + start, 1, end - start + 1); + *mask_sz = end + 1; + s += len; + } + if (!*mask_sz) { + pr_warn("Empty CPU range\n"); + return -EINVAL; + } + return 0; +cleanup: + free(*mask); + *mask = NULL; + return err; +} + +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz) +{ + int fd, err = 0, len; + char buf[128]; fd = open(fcpu, O_RDONLY); if (fd < 0) { - error = errno; - pr_warn("Failed to open file %s: %s\n", fcpu, strerror(error)); - return -error; + err = -errno; + pr_warn("Failed to open cpu mask file %s: %d\n", fcpu, err); + return err; } len = read(fd, buf, sizeof(buf)); close(fd); if (len <= 0) { - error = len ? errno : EINVAL; - pr_warn("Failed to read # of possible cpus from %s: %s\n", - fcpu, strerror(error)); - return -error; + err = len ?
-errno : -EINVAL; + pr_warn("Failed to read cpu mask from %s: %d\n", fcpu, err); + return err; } - if (len == sizeof(buf)) { - pr_warn("File %s size overflow\n", fcpu); - return -EOVERFLOW; + if (len >= sizeof(buf)) { + pr_warn("CPU mask is too big in file %s\n", fcpu); + return -E2BIG; } buf[len] = '\0'; - for (ir = 0, tmp_cpus = 0; ir <= len; ir++) { - /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ - if (buf[ir] == ',' || buf[ir] == '\0') { - buf[ir] = '\0'; - n = sscanf(&buf[il], "%u-%u", &start, &end); - if (n <= 0) { - pr_warn("Failed to get # CPUs from %s\n", - &buf[il]); - return -EINVAL; - } else if (n == 1) { - end = start; - } - tmp_cpus += end - start + 1; - il = ir + 1; - } - } - if (tmp_cpus <= 0) { - pr_warn("Invalid #CPUs %d from %s\n", tmp_cpus, fcpu); - return -EINVAL; + return parse_cpu_mask_str(buf, mask, mask_sz); +} + +int libbpf_num_possible_cpus(void) +{ + static const char *fcpu = "/sys/devices/system/cpu/possible"; + static int cpus; + int err, n, i, tmp_cpus; + bool *mask; + + tmp_cpus = READ_ONCE(cpus); + if (tmp_cpus > 0) + return tmp_cpus; + + err = parse_cpu_mask_file(fcpu, &mask, &n); + if (err) + return err; + + tmp_cpus = 0; + for (i = 0; i < n; i++) { + if (mask[i]) + tmp_cpus++; } + free(mask); WRITE_ONCE(cpus, tmp_cpus); return tmp_cpus; } + +int bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, + .object_name = s->name, + ); + struct bpf_object *obj; + int i; + + /* Attempt to preserve opts->object_name, unless overridden by user + * explicitly. Overwriting object name for skeletons is discouraged, + * as it breaks global data maps, because they contain object name + * prefix as their own map name prefix. When skeleton is generated, + * bpftool is making an assumption that this name will stay the same.
+ */ + if (opts) { + memcpy(&skel_opts, opts, sizeof(*opts)); + if (!opts->object_name) + skel_opts.object_name = s->name; + } + + obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); + if (IS_ERR(obj)) { + pr_warn("failed to initialize skeleton BPF object '%s': %ld\n", + s->name, PTR_ERR(obj)); + return PTR_ERR(obj); + } + + *s->obj = obj; + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map **map = s->maps[i].map; + const char *name = s->maps[i].name; + void **mmaped = s->maps[i].mmaped; + + *map = bpf_object__find_map_by_name(obj, name); + if (!*map) { + pr_warn("failed to find skeleton map '%s'\n", name); + return -ESRCH; + } + + /* externs shouldn't be pre-setup from user code */ + if (mmaped && (*map)->libbpf_type != LIBBPF_MAP_KCONFIG) + *mmaped = (*map)->mmaped; + } + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_program **prog = s->progs[i].prog; + const char *name = s->progs[i].name; + + *prog = bpf_object__find_program_by_name(obj, name); + if (!*prog) { + pr_warn("failed to find skeleton program '%s'\n", name); + return -ESRCH; + } + } + + return 0; +} + +int bpf_object__load_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + err = bpf_object__load(*s->obj); + if (err) { + pr_warn("failed to load BPF skeleton '%s': %d\n", s->name, err); + return err; + } + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map *map = *s->maps[i].map; + size_t mmap_sz = bpf_map_mmap_sz(map); + int prot, map_fd = bpf_map__fd(map); + void **mmaped = s->maps[i].mmaped; + + if (!mmaped) + continue; + + if (!(map->def.map_flags & BPF_F_MMAPABLE)) { + *mmaped = NULL; + continue; + } + + if (map->def.map_flags & BPF_F_RDONLY_PROG) + prot = PROT_READ; + else + prot = PROT_READ | PROT_WRITE; + + /* Remap anonymous mmap()-ed "map initialization image" as + * a BPF map-backed mmap()-ed memory, but preserving the same + * memory address. This will cause kernel to change process' + * page table to point to a different piece of kernel memory, + * but from userspace point of view memory address (and its + * contents, being identical at this point) will stay the + * same. This mapping will be released by bpf_object__close() + * as per normal clean up procedure, so we don't need to worry + * about it from skeleton's clean up perspective. 
+ */ + *mmaped = mmap(map->mmaped, mmap_sz, prot, + MAP_SHARED | MAP_FIXED, map_fd, 0); + if (*mmaped == MAP_FAILED) { + err = -errno; + *mmaped = NULL; + pr_warn("failed to re-mmap() map '%s': %d\n", + bpf_map__name(map), err); + return err; + } + } + + return 0; +} + +int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) +{ + int i; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_program *prog = *s->progs[i].prog; + struct bpf_link **link = s->progs[i].link; + const struct bpf_sec_def *sec_def; + const char *sec_name = bpf_program__title(prog, false); + + sec_def = find_sec_def(sec_name); + if (!sec_def || !sec_def->attach_fn) + continue; + + *link = sec_def->attach_fn(sec_def, prog); + if (IS_ERR(*link)) { + pr_warn("failed to auto-attach program '%s': %ld\n", + bpf_program__name(prog), PTR_ERR(*link)); + return PTR_ERR(*link); + } + } + + return 0; +} + +void bpf_object__detach_skeleton(struct bpf_object_skeleton *s) +{ + int i; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_link **link = s->progs[i].link; + + if (!IS_ERR_OR_NULL(*link)) + bpf_link__destroy(*link); + *link = NULL; + } +} + +void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) +{ + if (s->progs) + bpf_object__detach_skeleton(s); + if (s->obj) + bpf_object__close(*s->obj); + free(s->maps); + free(s->progs); + free(s); +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 0dbf4bfba0c4..fe592ef48f1b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -17,14 +17,12 @@ #include <sys/types.h> // for size_t #include <linux/bpf.h> +#include "libbpf_common.h" + #ifdef __cplusplus extern "C" { #endif -#ifndef LIBBPF_API -#define LIBBPF_API __attribute__((visibility("default"))) -#endif - enum libbpf_errno { __LIBBPF_ERRNO__START = 4000, @@ -67,28 +65,6 @@ struct bpf_object_open_attr { enum bpf_prog_type prog_type; }; -/* Helper macro to declare and initialize libbpf options struct - * - * This dance with uninitialized declaration, followed by memset to zero, - * followed by assignment using compound literal syntax is done to preserve - * ability to use a nice struct field initialization syntax and **hopefully** - * have all the padding bytes initialized to zero. It's not guaranteed though, - * when copying literal, that compiler won't copy garbage in literal's padding - * bytes, but that's the best way I've found and it seems to work in practice. - * - * Macro declares opts struct of given type and name, zero-initializes, - * including any extra padding, it with memset() and then assigns initial - * values provided by users in struct initializer-syntax as varargs. - */ -#define DECLARE_LIBBPF_OPTS(TYPE, NAME, ...) \ - struct TYPE NAME = ({ \ - memset(&NAME, 0, sizeof(struct TYPE)); \ - (struct TYPE) { \ - .sz = sizeof(struct TYPE), \ - __VA_ARGS__ \ - }; \ - }) - struct bpf_object_open_opts { /* size of this struct, for forward/backward compatiblity */ size_t sz; @@ -109,15 +85,19 @@ struct bpf_object_open_opts { */ const char *pin_root_path; __u32 attach_prog_fd; + /* Additional kernel config content that augments and overrides + * system Kconfig for CONFIG_xxx externs. 
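Tying the skeleton entry points above and the new kconfig open option together, a hedged end-to-end sketch; my_prog__create_skeleton() stands in for a bpftool-generated constructor and CONFIG_MY_FEATURE for a real Kconfig symbol, both hypothetical:

#include <errno.h>
#include "libbpf.h"

int run_my_prog(void)
{
    DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
        .kconfig = "CONFIG_MY_FEATURE=y", /* augments/overrides system Kconfig */
    );
    struct bpf_object_skeleton *s;
    int err;

    s = my_prog__create_skeleton(); /* hypothetical generated helper */
    if (!s)
        return -ENOMEM;

    err = bpf_object__open_skeleton(s, &opts);
    err = err ?: bpf_object__load_skeleton(s);
    err = err ?: bpf_object__attach_skeleton(s);

    /* on any failure, destroy detaches the links and closes the object */
    if (err)
        bpf_object__destroy_skeleton(s);
    return err;
}
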
+ */ + const char *kconfig; }; -#define bpf_object_open_opts__last_field attach_prog_fd +#define bpf_object_open_opts__last_field kconfig LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * -bpf_object__open_file(const char *path, struct bpf_object_open_opts *opts); +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, - struct bpf_object_open_opts *opts); + const struct bpf_object_open_opts *opts); /* deprecated bpf_object__open variants */ LIBBPF_API struct bpf_object * @@ -126,11 +106,6 @@ bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, LIBBPF_API struct bpf_object * bpf_object__open_xattr(struct bpf_object_open_attr *attr); -int bpf_object__section_size(const struct bpf_object *obj, const char *name, - __u32 *size); -int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, - __u32 *off); - enum libbpf_pin_type { LIBBPF_PIN_NONE, /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ @@ -161,6 +136,7 @@ struct bpf_object_load_attr { LIBBPF_API int bpf_object__load(struct bpf_object *obj); LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); LIBBPF_API int bpf_object__unload(struct bpf_object *obj); + LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); @@ -171,6 +147,9 @@ LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); LIBBPF_API struct bpf_program * bpf_object__find_program_by_title(const struct bpf_object *obj, const char *title); +LIBBPF_API struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name); LIBBPF_API struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp) \ @@ -214,6 +193,7 @@ LIBBPF_API void *bpf_program__priv(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); +LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); LIBBPF_API const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy); @@ -235,9 +215,12 @@ LIBBPF_API void bpf_program__unload(struct bpf_program *prog); struct bpf_link; +LIBBPF_API void bpf_link__disconnect(struct bpf_link *link); LIBBPF_API int bpf_link__destroy(struct bpf_link *link); LIBBPF_API struct bpf_link * +bpf_program__attach(struct bpf_program *prog); +LIBBPF_API struct bpf_link * bpf_program__attach_perf_event(struct bpf_program *prog, int pfd); LIBBPF_API struct bpf_link * bpf_program__attach_kprobe(struct bpf_program *prog, bool retprobe, @@ -512,18 +495,6 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, bpf_perf_event_print_t fn, void *private_data); -struct nlattr; -typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); -int libbpf_netlink_open(unsigned int *nl_pid); -int libbpf_nl_get_link(int sock, unsigned int nl_pid, - libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie); -int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex, - libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie); -int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, - libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie); -int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, - 
libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie); - struct bpf_prog_linfo; struct bpf_prog_info; @@ -630,6 +601,50 @@ bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); */ LIBBPF_API int libbpf_num_possible_cpus(void); +struct bpf_map_skeleton { + const char *name; + struct bpf_map **map; + void **mmaped; +}; + +struct bpf_prog_skeleton { + const char *name; + struct bpf_program **prog; + struct bpf_link **link; +}; + +struct bpf_object_skeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const char *name; + void *data; + size_t data_sz; + + struct bpf_object **obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_skeleton_map) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_skeleton_prog) */ + struct bpf_prog_skeleton *progs; +}; + +LIBBPF_API int +bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts); +LIBBPF_API int bpf_object__load_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API int bpf_object__attach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__detach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s); + +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 8ddc2c40e482..e9713a574243 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -208,3 +208,19 @@ LIBBPF_0.0.6 { btf__find_by_name_kind; libbpf_find_vmlinux_btf_id; } LIBBPF_0.0.5; + +LIBBPF_0.0.7 { + global: + btf_dump__emit_type_decl; + bpf_link__disconnect; + bpf_object__find_program_by_name; + bpf_object__attach_skeleton; + bpf_object__destroy_skeleton; + bpf_object__detach_skeleton; + bpf_object__load_skeleton; + bpf_object__open_skeleton; + bpf_prog_attach_xattr; + bpf_program__attach; + bpf_program__name; + btf__align_of; +} LIBBPF_0.0.6; diff --git a/tools/lib/bpf/libbpf.pc.template b/tools/lib/bpf/libbpf.pc.template index ac17fcef2108..b45ed534bdfb 100644 --- a/tools/lib/bpf/libbpf.pc.template +++ b/tools/lib/bpf/libbpf.pc.template @@ -8,5 +8,5 @@ Name: libbpf Description: BPF library Version: @VERSION@ Libs: -L${libdir} -lbpf -Requires.private: libelf +Requires.private: libelf zlib Cflags: -I${includedir} diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h new file mode 100644 index 000000000000..a23ae1ac27eb --- /dev/null +++ b/tools/lib/bpf/libbpf_common.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common user-facing libbpf helpers. + * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_COMMON_H +#define __LIBBPF_LIBBPF_COMMON_H + +#include <string.h> + +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + +/* Helper macro to declare and initialize libbpf options struct + * + * This dance with uninitialized declaration, followed by memset to zero, + * followed by assignment using compound literal syntax is done to preserve + * ability to use a nice struct field initialization syntax and **hopefully** + * have all the padding bytes initialized to zero. It's not guaranteed though, + * when copying literal, that compiler won't copy garbage in literal's padding + * bytes, but that's the best way I've found and it seems to work in practice. 
+ * + * The macro declares an opts struct of the given type and name, zero-initializes + * it (including any extra padding) with memset(), and then assigns initial + * values provided by users in struct initializer syntax as varargs. + */ +#define DECLARE_LIBBPF_OPTS(TYPE, NAME, ...) \ + struct TYPE NAME = ({ \ + memset(&NAME, 0, sizeof(struct TYPE)); \ + (struct TYPE) { \ + .sz = sizeof(struct TYPE), \ + __VA_ARGS__ \ + }; \ + }) + +#endif /* __LIBBPF_LIBBPF_COMMON_H */ diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 97ac17a64a58..8c3afbd97747 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -76,7 +76,7 @@ static inline bool libbpf_validate_opts(const char *opts, for (i = opts_sz; i < user_sz; i++) { if (opts[i]) { - pr_warn("%s has non-zero extra bytes", + pr_warn("%s has non-zero extra bytes\n", type_name); return false; } @@ -95,9 +95,28 @@ static inline bool libbpf_validate_opts(const char *opts, #define OPTS_GET(opts, field, fallback_value) \ (OPTS_HAS(opts, field) ? (opts)->field : fallback_value) +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); +int bpf_object__section_size(const struct bpf_object *obj, const char *name, + __u32 *size); +int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, + __u32 *off); + +struct nlattr; +typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); +int libbpf_netlink_open(unsigned int *nl_pid); +int libbpf_nl_get_link(int sock, unsigned int nl_pid, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie); +int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie); +int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie); +int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie); + struct btf_ext_info { /* * info points to the individual info section (e.g. func_info and diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 419652458da4..b139b3d75ebb 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -21,7 +21,6 @@ test_lirc_mode2_user get_cgroup_id_user test_skb_cgroup_id_user test_socket_cookie -test_cgroup_attach test_cgroup_storage test_select_reuseport test_flow_dissector @@ -38,5 +37,7 @@ test_hashmap test_btf_dump xdping test_cpp +*.skel.h /no_alu32 /bpf_gcc +/tools diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e0fe01d9ec33..41691fb067da 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -3,10 +3,12 @@ include ../../../../scripts/Kbuild.include include ../../../scripts/Makefile.arch CURDIR := $(abspath .) -LIBDIR := $(abspath ../../../lib) +TOOLSDIR := $(abspath ../../..)
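+# (For reference: relative to tools/testing/selftests/bpf, TOOLSDIR is
+# tools/, so the directories derived below resolve to tools/lib,
+# tools/lib/bpf, tools/include and tools/bpf/bpftool.)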
+LIBDIR := $(TOOLSDIR)/lib BPFDIR := $(LIBDIR)/bpf -TOOLSDIR := $(abspath ../../../include) -APIDIR := $(TOOLSDIR)/uapi +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi GENDIR := $(abspath ../../../../include/generated) GENHDR := $(GENDIR)/autoconf.h @@ -19,18 +21,18 @@ LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) \ - -I$(GENDIR) -I$(TOOLSDIR) -I$(CURDIR) \ + -I$(GENDIR) -I$(TOOLSINCDIR) -I$(CURDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program -LDLIBS += -lcap -lelf -lrt -lpthread +LDLIBS += -lcap -lelf -lz -lrt -lpthread # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ - test_cgroup_storage test_select_reuseport \ + test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ - test_cgroup_attach test_progs-no_alu32 + test_progs-no_alu32 # Also test bpf-gcc, if present ifneq ($(BPF_GCC),) @@ -75,6 +77,24 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ TEST_CUSTOM_PROGS = urandom_read +# Emit succinct information message describing current building step +# $1 - generic step name (e.g., CC, LINK, etc); +# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor]; +# $3 - target (assumed to be file); only file name will be emitted; +# $4 - optional extra arg, emitted as-is, if provided. +ifeq ($(V),1) +msg = +else +msg = @$(info $(1)$(if $(2), [$(2)]) $(notdir $(3)))$(if $(4), $(4)) +endif + +# override lib.mk's default rules +OVERRIDE_TARGETS := 1 +override define CLEAN + $(call msg, CLEAN) + $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) +endef + include ../lib.mk # Define simple and short `make test_progs`, `make test_sysctl`, etc targets @@ -87,10 +107,16 @@ $(notdir $(TEST_GEN_PROGS) \ $(TEST_GEN_PROGS_EXTENDED) \ $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ; +$(OUTPUT)/%:%.c + $(call msg, BINARY,,$@) + $(LINK.c) $^ $(LDLIBS) -o $@ + $(OUTPUT)/urandom_read: urandom_read.c + $(call msg, BINARY,,$@) $(CC) -o $@ $< -Wl,--build-id $(OUTPUT)/test_stub.o: test_stub.c + $(call msg, CC,,$@) $(CC) -c $(CFLAGS) -o $@ $< BPFOBJ := $(OUTPUT)/libbpf.a @@ -110,13 +136,18 @@ $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c $(OUTPUT)/test_netcnt: cgroup_helpers.c $(OUTPUT)/test_sock_fields: cgroup_helpers.c $(OUTPUT)/test_sysctl: cgroup_helpers.c -$(OUTPUT)/test_cgroup_attach: cgroup_helpers.c .PHONY: force # force a rebuild of BPFOBJ when its dependencies are updated force: +DEFAULT_BPFTOOL := $(OUTPUT)/tools/usr/local/sbin/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +$(DEFAULT_BPFTOOL): force + $(MAKE) -C $(BPFTOOLDIR) DESTDIR=$(OUTPUT)/tools install + $(BPFOBJ): force $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ @@ -159,27 +190,33 @@ $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h # $3 - CFLAGS # $4 - LDFLAGS define CLANG_BPF_BUILD_RULE + $(call msg, CLANG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -mattr=dwarfris -march=bpf -mcpu=probe $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE + $(call msg, 
CLANG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -march=bpf -mcpu=v2 $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but using native Clang and bpf LLC define CLANG_NATIVE_BPF_BUILD_RULE + $(call msg, CLANG-BPF,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ $(LLC) -march=bpf -mcpu=probe $4 -filetype=obj -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE + $(call msg, GCC-BPF,$(TRUNNER_BINARY),$2) $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef +SKEL_BLACKLIST := btf__% test_pinning_invalid.c + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: @@ -195,8 +232,11 @@ TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ $$(filter %.c,$(TRUNNER_EXTRA_SOURCES))) TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES)) TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h -TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ - $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c))) +TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) +TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)) +TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ + $$(filter-out $(SKEL_BLACKLIST), \ + $$(TRUNNER_BPF_SRCS))) # Evaluate rules now with extra TRUNNER_XXX variables above already defined $$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2)) @@ -226,12 +266,19 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS), \ $(TRUNNER_BPF_LDFLAGS)) + +$(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ + $(TRUNNER_OUTPUT)/%.o \ + | $(BPFTOOL) $(TRUNNER_OUTPUT) + $$(call msg, GEN-SKEL,$(TRUNNER_BINARY),$$@) + $$(BPFTOOL) gen skeleton $$< > $$@ endif # ensure we set up tests.h header generation rule just once ifeq ($($(TRUNNER_TESTS_DIR)-tests-hdr),) $(TRUNNER_TESTS_DIR)-tests-hdr := y $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c + $$(call msg, TEST-HDR,$(TRUNNER_BINARY),$$@) $$(shell ( cd $(TRUNNER_TESTS_DIR); \ echo '/* Generated header, do not edit */'; \ ls *.c 2> /dev/null | \ @@ -245,7 +292,9 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_OBJS) \ + $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $$(call msg, TEST-OBJ,$(TRUNNER_BINARY),$$@) cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ @@ -253,17 +302,20 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_TESTS_HDR) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $$(call msg, EXTRA-OBJ,$(TRUNNER_BINARY),$$@) $$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ +# only copy extra resources if in flavored build $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) ifneq ($2,) - # only copy extra resources if in flavored build + $$(call msg, EXTRAS-CP,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) cp -a $$^ $(TRUNNER_OUTPUT)/ endif $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ | $(TRUNNER_BINARY)-extras + $$(call msg, BINARY,,$$@) $$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ endef @@ -315,12 +367,15 @@ verifier/tests.h: verifier/*.c echo '#endif' \ ) > verifier/tests.h) $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | 
$(OUTPUT) + $(call msg, BINARY,,$@) $(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ # Make sure we are able to include and link libbpf against c++. -$(OUTPUT)/test_cpp: test_cpp.cpp $(BPFOBJ) +$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg, CXX,,$@) $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ - feature $(OUTPUT)/*.o $(OUTPUT)/no_alu32 $(OUTPUT)/bpf_gcc + feature $(OUTPUT)/*.o $(OUTPUT)/no_alu32 $(OUTPUT)/bpf_gcc \ + tools *.skel.h diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a83111a32d4a..a0ee87c8e1ea 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -1,26 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> - -#define EMBED_FILE(NAME, PATH) \ -asm ( \ -" .pushsection \".rodata\", \"a\", @progbits \n" \ -" .global "#NAME"_data \n" \ -#NAME"_data: \n" \ -" .incbin \"" PATH "\" \n" \ -#NAME"_data_end: \n" \ -" .global "#NAME"_size \n" \ -" .type "#NAME"_size, @object \n" \ -" .size "#NAME"_size, 4 \n" \ -" .align 4, \n" \ -#NAME"_size: \n" \ -" .int "#NAME"_data_end - "#NAME"_data \n" \ -" .popsection \n" \ -); \ -extern char NAME##_data[]; \ -extern int NAME##_size; +#include "test_attach_probe.skel.h" ssize_t get_base_addr() { - size_t start; + size_t start, offset; char buf[256]; FILE *f; @@ -28,10 +11,11 @@ ssize_t get_base_addr() { if (!f) return -errno; - while (fscanf(f, "%zx-%*x %s %*s\n", &start, buf) == 2) { + while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n", + &start, buf, &offset) == 3) { if (strcmp(buf, "r-xp") == 0) { fclose(f); - return start; + return start - offset; } } @@ -39,30 +23,12 @@ ssize_t get_base_addr() { return -EINVAL; } -EMBED_FILE(probe, "test_attach_probe.o"); - void test_attach_probe(void) { - const char *kprobe_name = "kprobe/sys_nanosleep"; - const char *kretprobe_name = "kretprobe/sys_nanosleep"; - const char *uprobe_name = "uprobe/trigger_func"; - const char *uretprobe_name = "uretprobe/trigger_func"; - const int kprobe_idx = 0, kretprobe_idx = 1; - const int uprobe_idx = 2, uretprobe_idx = 3; - const char *obj_name = "attach_probe"; - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts, - .object_name = obj_name, - .relaxed_maps = true, - ); - struct bpf_program *kprobe_prog, *kretprobe_prog; - struct bpf_program *uprobe_prog, *uretprobe_prog; - struct bpf_object *obj; - int err, duration = 0, res; - struct bpf_link *kprobe_link = NULL; - struct bpf_link *kretprobe_link = NULL; - struct bpf_link *uprobe_link = NULL; - struct bpf_link *uretprobe_link = NULL; - int results_map_fd; + int duration = 0; + struct bpf_link *kprobe_link, *kretprobe_link; + struct bpf_link *uprobe_link, *uretprobe_link; + struct test_attach_probe* skel; size_t uprobe_offset; ssize_t base_addr; @@ -72,123 +38,68 @@ void test_attach_probe(void) return; uprobe_offset = (size_t)&get_base_addr - base_addr; - /* open object */ - obj = bpf_object__open_mem(probe_data, probe_size, &open_opts); - if (CHECK(IS_ERR(obj), "obj_open_mem", "err %ld\n", PTR_ERR(obj))) + skel = test_attach_probe__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; - - if (CHECK(strcmp(bpf_object__name(obj), obj_name), "obj_name", - "wrong obj name '%s', expected '%s'\n", - bpf_object__name(obj), obj_name)) + if (CHECK(!skel->bss, "check_bss", ".bss wasn't mmap()-ed\n")) 
goto cleanup; - kprobe_prog = bpf_object__find_program_by_title(obj, kprobe_name); - if (CHECK(!kprobe_prog, "find_probe", - "prog '%s' not found\n", kprobe_name)) - goto cleanup; - kretprobe_prog = bpf_object__find_program_by_title(obj, kretprobe_name); - if (CHECK(!kretprobe_prog, "find_probe", - "prog '%s' not found\n", kretprobe_name)) - goto cleanup; - uprobe_prog = bpf_object__find_program_by_title(obj, uprobe_name); - if (CHECK(!uprobe_prog, "find_probe", - "prog '%s' not found\n", uprobe_name)) - goto cleanup; - uretprobe_prog = bpf_object__find_program_by_title(obj, uretprobe_name); - if (CHECK(!uretprobe_prog, "find_probe", - "prog '%s' not found\n", uretprobe_name)) - goto cleanup; - - /* create maps && load programs */ - err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "err %d\n", err)) - goto cleanup; - - /* load maps */ - results_map_fd = bpf_find_map(__func__, obj, "results_map"); - if (CHECK(results_map_fd < 0, "find_results_map", - "err %d\n", results_map_fd)) - goto cleanup; - - kprobe_link = bpf_program__attach_kprobe(kprobe_prog, + kprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kprobe, false /* retprobe */, SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", - "err %ld\n", PTR_ERR(kprobe_link))) { - kprobe_link = NULL; + "err %ld\n", PTR_ERR(kprobe_link))) goto cleanup; - } - kretprobe_link = bpf_program__attach_kprobe(kretprobe_prog, + skel->links.handle_kprobe = kprobe_link; + + kretprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kretprobe, true /* retprobe */, SYS_NANOSLEEP_KPROBE_NAME); if (CHECK(IS_ERR(kretprobe_link), "attach_kretprobe", - "err %ld\n", PTR_ERR(kretprobe_link))) { - kretprobe_link = NULL; + "err %ld\n", PTR_ERR(kretprobe_link))) goto cleanup; - } - uprobe_link = bpf_program__attach_uprobe(uprobe_prog, + skel->links.handle_kretprobe = kretprobe_link; + + uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe, false /* retprobe */, 0 /* self pid */, "/proc/self/exe", uprobe_offset); if (CHECK(IS_ERR(uprobe_link), "attach_uprobe", - "err %ld\n", PTR_ERR(uprobe_link))) { - uprobe_link = NULL; + "err %ld\n", PTR_ERR(uprobe_link))) goto cleanup; - } - uretprobe_link = bpf_program__attach_uprobe(uretprobe_prog, + skel->links.handle_uprobe = uprobe_link; + + uretprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uretprobe, true /* retprobe */, -1 /* any pid */, "/proc/self/exe", uprobe_offset); if (CHECK(IS_ERR(uretprobe_link), "attach_uretprobe", - "err %ld\n", PTR_ERR(uretprobe_link))) { - uretprobe_link = NULL; + "err %ld\n", PTR_ERR(uretprobe_link))) goto cleanup; - } + skel->links.handle_uretprobe = uretprobe_link; /* trigger & validate kprobe && kretprobe */ usleep(1); - err = bpf_map_lookup_elem(results_map_fd, &kprobe_idx, &res); - if (CHECK(err, "get_kprobe_res", - "failed to get kprobe res: %d\n", err)) + if (CHECK(skel->bss->kprobe_res != 1, "check_kprobe_res", + "wrong kprobe res: %d\n", skel->bss->kprobe_res)) goto cleanup; - if (CHECK(res != kprobe_idx + 1, "check_kprobe_res", - "wrong kprobe res: %d\n", res)) - goto cleanup; - - err = bpf_map_lookup_elem(results_map_fd, &kretprobe_idx, &res); - if (CHECK(err, "get_kretprobe_res", - "failed to get kretprobe res: %d\n", err)) - goto cleanup; - if (CHECK(res != kretprobe_idx + 1, "check_kretprobe_res", - "wrong kretprobe res: %d\n", res)) + if (CHECK(skel->bss->kretprobe_res != 2, "check_kretprobe_res", + "wrong kretprobe res: %d\n", skel->bss->kretprobe_res)) goto cleanup; /* trigger & validate uprobe & uretprobe */ 
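	/* (calling get_base_addr() below is what fires the uprobe and
	 * uretprobe: both were attached at that function's offset within
	 * /proc/self/exe)
	 */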
get_base_addr(); - err = bpf_map_lookup_elem(results_map_fd, &uprobe_idx, &res); - if (CHECK(err, "get_uprobe_res", - "failed to get uprobe res: %d\n", err)) - goto cleanup; - if (CHECK(res != uprobe_idx + 1, "check_uprobe_res", - "wrong uprobe res: %d\n", res)) - goto cleanup; - - err = bpf_map_lookup_elem(results_map_fd, &uretprobe_idx, &res); - if (CHECK(err, "get_uretprobe_res", - "failed to get uretprobe res: %d\n", err)) + if (CHECK(skel->bss->uprobe_res != 3, "check_uprobe_res", + "wrong uprobe res: %d\n", skel->bss->uprobe_res)) goto cleanup; - if (CHECK(res != uretprobe_idx + 1, "check_uretprobe_res", - "wrong uretprobe res: %d\n", res)) + if (CHECK(skel->bss->uretprobe_res != 4, "check_uretprobe_res", + "wrong uretprobe res: %d\n", skel->bss->uretprobe_res)) goto cleanup; cleanup: - bpf_link__destroy(kprobe_link); - bpf_link__destroy(kretprobe_link); - bpf_link__destroy(uprobe_link); - bpf_link__destroy(uretprobe_link); - bpf_object__close(obj); + test_attach_probe__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c new file mode 100644 index 000000000000..5b13f2c6c402 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> + +#include "cgroup_helpers.h" + +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int prog_load(void) +{ + struct bpf_insn prog[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = 1 */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); +} + +void test_cgroup_attach_autodetach(void) +{ + __u32 duration = 0, prog_cnt = 4, attach_flags; + int allow_prog[2] = {-1}; + __u32 prog_ids[2] = {0}; + void *ptr = NULL; + int cg = 0, i; + int attempts; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + allow_prog[i] = prog_load(); + if (CHECK(allow_prog[i] < 0, "prog_load", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + } + + if (CHECK_FAIL(setup_cgroup_environment())) + goto err; + + /* create a cgroup, attach two programs and remember their ids */ + cg = create_and_get_cgroup("/cg_autodetach"); + if (CHECK_FAIL(cg < 0)) + goto err; + + if (CHECK_FAIL(join_cgroup("/cg_autodetach"))) + goto err; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) + if (CHECK(bpf_prog_attach(allow_prog[i], cg, + BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog_attach", "prog[%d], errno=%d\n", i, errno)) + goto err; + + /* make sure that programs are attached and run some traffic */ + if (CHECK(bpf_prog_query(cg, BPF_CGROUP_INET_EGRESS, 0, &attach_flags, + prog_ids, &prog_cnt), + "prog_query", "errno=%d\n", errno)) + goto err; + if (CHECK_FAIL(system(PING_CMD))) + goto err; + + /* allocate some memory (4Mb) to pin the original cgroup */ + ptr = malloc(4 * (1 << 20)); + if (CHECK_FAIL(!ptr)) + goto err; + + /* close programs and cgroup fd */ + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + close(allow_prog[i]); + allow_prog[i] = -1; + } + + close(cg); + cg = 0; + + /* leave the cgroup and remove it. don't detach programs */ + cleanup_cgroup_environment(); + + /* wait for the asynchronous auto-detachment. + * wait for no more than 5 sec and give up. 
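+	 * (the 4MB allocated above keeps the removed cgroup pinned in
+	 * memory, so the check below verifies that the programs are
+	 * auto-detached and freed even while the cgroup itself lingers)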
+ */ + for (i = 0; i < ARRAY_SIZE(prog_ids); i++) { + for (attempts = 5; attempts >= 0; attempts--) { + int fd = bpf_prog_get_fd_by_id(prog_ids[i]); + + if (fd < 0) + break; + + /* don't leave the fd open */ + close(fd); + + if (CHECK_FAIL(!attempts)) + goto err; + + sleep(1); + } + } + +err: + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) + if (allow_prog[i] >= 0) + close(allow_prog[i]); + if (cg) + close(cg); + free(ptr); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c new file mode 100644 index 000000000000..2ff21dbce179 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> + +#include "cgroup_helpers.h" + +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int map_fd = -1; + +static int prog_load_cnt(int verdict, int val) +{ + int cgroup_storage_fd, percpu_cgroup_storage_fd; + + if (map_fd < 0) + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, + sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + if (cgroup_storage_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + percpu_cgroup_storage_fd = bpf_create_map( + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + if (percpu_cgroup_storage_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + struct bpf_insn prog[] = { + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), + BPF_MOV64_IMM(BPF_REG_1, val), + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), + + BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0), + + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + int ret; + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + + close(cgroup_storage_fd); + return ret; +} + +void test_cgroup_attach_multi(void) +{ + __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; + int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, attach_opts); + int allow_prog[7] = {-1}; + unsigned long long value; + __u32 duration = 0; + int i = 0; + + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { + allow_prog[i] = prog_load_cnt(1, 1 << i); + if 
(CHECK(allow_prog[i] < 0, "prog_load", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + } + + if (CHECK_FAIL(setup_cgroup_environment())) + goto err; + + cg1 = create_and_get_cgroup("/cg1"); + if (CHECK_FAIL(cg1 < 0)) + goto err; + cg2 = create_and_get_cgroup("/cg1/cg2"); + if (CHECK_FAIL(cg2 < 0)) + goto err; + cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); + if (CHECK_FAIL(cg3 < 0)) + goto err; + cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); + if (CHECK_FAIL(cg4 < 0)) + goto err; + cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); + if (CHECK_FAIL(cg5 < 0)) + goto err; + + if (CHECK_FAIL(join_cgroup("/cg1/cg2/cg3/cg4/cg5"))) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog0_attach_to_cg1_multi", "errno=%d\n", errno)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "fail_same_prog_attach_to_cg1", "unexpected success\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog1_attach_to_cg1_multi", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog2_attach_to_cg2_override", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI), + "prog3_attach_to_cg3_multi", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog4_attach_to_cg4_override", "errno=%d\n", errno)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0), + "prog5_attach_to_cg5_none", "errno=%d\n", errno)) + goto err; + + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 1 + 2 + 8 + 32); + + /* query the number of effective progs in cg5 */ + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, NULL, NULL, &prog_cnt)); + CHECK_FAIL(prog_cnt != 4); + /* retrieve prog_ids of effective progs in cg5 */ + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 4); + CHECK_FAIL(attach_flags != 0); + saved_prog_id = prog_ids[0]; + /* check enospc handling */ + prog_ids[0] = 0; + prog_cnt = 2; + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt) != -1); + CHECK_FAIL(errno != ENOSPC); + CHECK_FAIL(prog_cnt != 4); + /* check that prog_ids are returned even when buffer is too small */ + CHECK_FAIL(prog_ids[0] != saved_prog_id); + /* retrieve prog_id of single attached prog in cg5 */ + prog_ids[0] = 0; + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, NULL, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 1); + CHECK_FAIL(prog_ids[0] != saved_prog_id); + + /* detach bottom program and ping again */ + if (CHECK(bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS), + "prog_detach_from_cg5", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 1 + 2 + 8 + 16); + + /* test replace */ + + attach_opts.flags = BPF_F_ALLOW_OVERRIDE | BPF_F_REPLACE; + attach_opts.replace_prog_fd = allow_prog[0]; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + 
BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_override", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != EINVAL); + + attach_opts.flags = BPF_F_REPLACE; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_no_multi", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != EINVAL); + + attach_opts.flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE; + attach_opts.replace_prog_fd = -1; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_bad_fd", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != EBADF); + + /* replacing a program that is not attached to cgroup should fail */ + attach_opts.replace_prog_fd = allow_prog[3]; + if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "fail_prog_replace_no_ent", "unexpected success\n")) + goto err; + CHECK_FAIL(errno != ENOENT); + + /* replace 1st from the top program */ + attach_opts.replace_prog_fd = allow_prog[0]; + if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "prog_replace", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 64 + 2 + 8 + 16); + + /* detach 3rd from bottom program and ping again */ + if (CHECK(!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS), + "fail_prog_detach_from_cg3", "unexpected success\n")) + goto err; + + if (CHECK(bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS), + "prog3_detach_from_cg3", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 64 + 2 + 16); + + /* detach 2nd from bottom program and ping again */ + if (CHECK(bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS), + "prog_detach_from_cg4", "errno=%d\n", errno)) + goto err; + + value = 0; + CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); + CHECK_FAIL(system(PING_CMD)); + CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value)); + CHECK_FAIL(value != 64 + 2 + 4); + + prog_cnt = 4; + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 3); + CHECK_FAIL(attach_flags != 0); + CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, NULL, + prog_ids, &prog_cnt)); + CHECK_FAIL(prog_cnt != 0); + +err: + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) + if (allow_prog[i] >= 0) + close(allow_prog[i]); + close(cg1); + close(cg2); + close(cg3); + close(cg4); + close(cg5); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c new file mode 100644 index 000000000000..9d8cb48b99de --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> + +#include "cgroup_helpers.h" + +#define FOO "/foo" +#define BAR "/foo/bar/" +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int prog_load(int verdict) +{ + struct bpf_insn prog[] = { + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t 
insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); +} + +void test_cgroup_attach_override(void) +{ + int drop_prog = -1, allow_prog = -1, foo = -1, bar = -1; + __u32 duration = 0; + + allow_prog = prog_load(1); + if (CHECK(allow_prog < 0, "prog_load_allow", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + + drop_prog = prog_load(0); + if (CHECK(drop_prog < 0, "prog_load_drop", + "verifier output:\n%s\n-------\n", bpf_log_buf)) + goto err; + + foo = test__join_cgroup(FOO); + if (CHECK(foo < 0, "cgroup_join_foo", "cgroup setup failed\n")) + goto err; + + if (CHECK(bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_drop_foo_override", + "attach prog to %s failed, errno=%d\n", FOO, errno)) + goto err; + + if (CHECK(!system(PING_CMD), "ping_fail", + "ping unexpectedly succeeded\n")) + goto err; + + bar = test__join_cgroup(BAR); + if (CHECK(bar < 0, "cgroup_join_bar", "cgroup setup failed\n")) + goto err; + + if (CHECK(!system(PING_CMD), "ping_fail", + "ping unexpectedly succeeded\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_allow_bar_override", + "attach prog to %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(system(PING_CMD), "ping_ok", "ping failed\n")) + goto err; + + if (CHECK(bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS), + "prog_detach_bar", + "detach prog from %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(!system(PING_CMD), "ping_fail", + "ping unexpectedly succeeded\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_allow_bar_override", + "attach prog to %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS), + "prog_detach_foo", + "detach prog from %s failed, errno=%d\n", FOO, errno)) + goto err; + + if (CHECK(system(PING_CMD), "ping_ok", "ping failed\n")) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "prog_attach_allow_bar_override", + "attach prog to %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0), + "fail_prog_attach_allow_bar_none", + "attach prog to %s unexpectedly succeeded\n", BAR)) + goto err; + + if (CHECK(bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS), + "prog_detach_bar", + "detach prog from %s failed, errno=%d\n", BAR, errno)) + goto err; + + if (CHECK(!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS), + "fail_prog_detach_foo", + "double detach from %s unexpectedly succeeded\n", FOO)) + goto err; + + if (CHECK(bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0), + "prog_attach_allow_foo_none", + "attach prog to %s failed, errno=%d\n", FOO, errno)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0), + "fail_prog_attach_allow_bar_none", + "attach prog to %s unexpectedly succeeded\n", BAR)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "fail_prog_attach_allow_bar_override", + "attach prog to %s unexpectedly succeeded\n", BAR)) + goto err; + + if (CHECK(!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE), + "fail_prog_attach_allow_foo_override", + "attach prog to %s unexpectedly succeeded\n", 
FOO)) + goto err; + + if (CHECK(bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0), + "prog_attach_drop_foo_none", + "attach prog to %s failed, errno=%d\n", FOO, errno)) + goto err; + +err: + close(foo); + close(bar); + close(allow_prog); + close(drop_prog); +} diff --git a/tools/testing/selftests/bpf/prog_tests/core_extern.c b/tools/testing/selftests/bpf/prog_tests/core_extern.c new file mode 100644 index 000000000000..b093787e9448 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_extern.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <test_progs.h> +#include <sys/mman.h> +#include <sys/utsname.h> +#include <linux/version.h> +#include "test_core_extern.skel.h" + +static uint32_t get_kernel_version(void) +{ + uint32_t major, minor, patch; + struct utsname info; + + uname(&info); + if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) + return 0; + return KERNEL_VERSION(major, minor, patch); +} + +#define CFG "CONFIG_BPF_SYSCALL=n\n" + +static struct test_case { + const char *name; + const char *cfg; + bool fails; + struct test_core_extern__data data; +} test_cases[] = { + { .name = "default search path", .data = { .bpf_syscall = true } }, + { + .name = "custom values", + .cfg = "CONFIG_BPF_SYSCALL=n\n" + "CONFIG_TRISTATE=m\n" + "CONFIG_BOOL=y\n" + "CONFIG_CHAR=100\n" + "CONFIG_USHORT=30000\n" + "CONFIG_INT=123456\n" + "CONFIG_ULONG=0xDEADBEEFC0DE\n" + "CONFIG_STR=\"abracad\"\n" + "CONFIG_MISSING=0", + .data = { + .bpf_syscall = false, + .tristate_val = TRI_MODULE, + .bool_val = true, + .char_val = 100, + .ushort_val = 30000, + .int_val = 123456, + .ulong_val = 0xDEADBEEFC0DE, + .str_val = "abracad", + }, + }, + /* TRISTATE */ + { .name = "tristate (y)", .cfg = CFG"CONFIG_TRISTATE=y\n", + .data = { .tristate_val = TRI_YES } }, + { .name = "tristate (n)", .cfg = CFG"CONFIG_TRISTATE=n\n", + .data = { .tristate_val = TRI_NO } }, + { .name = "tristate (m)", .cfg = CFG"CONFIG_TRISTATE=m\n", + .data = { .tristate_val = TRI_MODULE } }, + { .name = "tristate (int)", .fails = 1, .cfg = CFG"CONFIG_TRISTATE=1" }, + { .name = "tristate (bad)", .fails = 1, .cfg = CFG"CONFIG_TRISTATE=M" }, + /* BOOL */ + { .name = "bool (y)", .cfg = CFG"CONFIG_BOOL=y\n", + .data = { .bool_val = true } }, + { .name = "bool (n)", .cfg = CFG"CONFIG_BOOL=n\n", + .data = { .bool_val = false } }, + { .name = "bool (tristate)", .fails = 1, .cfg = CFG"CONFIG_BOOL=m" }, + { .name = "bool (int)", .fails = 1, .cfg = CFG"CONFIG_BOOL=1" }, + /* CHAR */ + { .name = "char (tristate)", .cfg = CFG"CONFIG_CHAR=m\n", + .data = { .char_val = 'm' } }, + { .name = "char (bad)", .fails = 1, .cfg = CFG"CONFIG_CHAR=q\n" }, + { .name = "char (empty)", .fails = 1, .cfg = CFG"CONFIG_CHAR=\n" }, + { .name = "char (str)", .fails = 1, .cfg = CFG"CONFIG_CHAR=\"y\"\n" }, + /* STRING */ + { .name = "str (empty)", .cfg = CFG"CONFIG_STR=\"\"\n", + .data = { .str_val = "\0\0\0\0\0\0\0" } }, + { .name = "str (padded)", .cfg = CFG"CONFIG_STR=\"abra\"\n", + .data = { .str_val = "abra\0\0\0" } }, + { .name = "str (too long)", .cfg = CFG"CONFIG_STR=\"abracada\"\n", + .data = { .str_val = "abracad" } }, + { .name = "str (no value)", .fails = 1, .cfg = CFG"CONFIG_STR=\n" }, + { .name = "str (bad value)", .fails = 1, .cfg = CFG"CONFIG_STR=bla\n" }, + /* INTEGERS */ + { + .name = "integer forms", + .cfg = CFG + "CONFIG_CHAR=0xA\n" + "CONFIG_USHORT=0462\n" + "CONFIG_INT=-100\n" + "CONFIG_ULONG=+1000000000000", + .data = { + .char_val = 0xA, + .ushort_val = 0462, + .int_val = 
-100, + .ulong_val = 1000000000000, + }, + }, + { .name = "int (bad)", .fails = 1, .cfg = CFG"CONFIG_INT=abc" }, + { .name = "int (str)", .fails = 1, .cfg = CFG"CONFIG_INT=\"abc\"" }, + { .name = "int (empty)", .fails = 1, .cfg = CFG"CONFIG_INT=" }, + { .name = "int (mixed)", .fails = 1, .cfg = CFG"CONFIG_INT=123abc" }, + { .name = "int (max)", .cfg = CFG"CONFIG_INT=2147483647", + .data = { .int_val = 2147483647 } }, + { .name = "int (min)", .cfg = CFG"CONFIG_INT=-2147483648", + .data = { .int_val = -2147483648 } }, + { .name = "int (max+1)", .fails = 1, .cfg = CFG"CONFIG_INT=2147483648" }, + { .name = "int (min-1)", .fails = 1, .cfg = CFG"CONFIG_INT=-2147483649" }, + { .name = "ushort (max)", .cfg = CFG"CONFIG_USHORT=65535", + .data = { .ushort_val = 65535 } }, + { .name = "ushort (min)", .cfg = CFG"CONFIG_USHORT=0", + .data = { .ushort_val = 0 } }, + { .name = "ushort (max+1)", .fails = 1, .cfg = CFG"CONFIG_USHORT=65536" }, + { .name = "ushort (min-1)", .fails = 1, .cfg = CFG"CONFIG_USHORT=-1" }, + { .name = "u64 (max)", .cfg = CFG"CONFIG_ULONG=0xffffffffffffffff", + .data = { .ulong_val = 0xffffffffffffffff } }, + { .name = "u64 (min)", .cfg = CFG"CONFIG_ULONG=0", + .data = { .ulong_val = 0 } }, + { .name = "u64 (max+1)", .fails = 1, .cfg = CFG"CONFIG_ULONG=0x10000000000000000" }, +}; + +void test_core_extern(void) +{ + const uint32_t kern_ver = get_kernel_version(); + int err, duration = 0, i, j; + struct test_core_extern *skel = NULL; + uint64_t *got, *exp; + int n = sizeof(*skel->data) / sizeof(uint64_t); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + struct test_case *t = &test_cases[i]; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, + .kconfig = t->cfg, + ); + + if (!test__start_subtest(t->name)) + continue; + + skel = test_core_extern__open_opts(&opts); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) + goto cleanup; + err = test_core_extern__load(skel); + if (t->fails) { + CHECK(!err, "skel_load", + "shouldn't succeed open/load of skeleton\n"); + goto cleanup; + } else if (CHECK(err, "skel_load", + "failed to open/load skeleton\n")) { + goto cleanup; + } + err = test_core_extern__attach(skel); + if (CHECK(err, "attach_raw_tp", "failed attach: %d\n", err)) + goto cleanup; + + usleep(1); + + t->data.kern_ver = kern_ver; + t->data.missing_val = 0xDEADC0DE; + got = (uint64_t *)skel->data; + exp = (uint64_t *)&t->data; + for (j = 0; j < n; j++) { + CHECK(got[j] != exp[j], "check_res", + "result #%d: expected %lx, but got %lx\n", + j, exp[j], got[j]); + } +cleanup: + test_core_extern__destroy(skel); + skel = NULL; + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 05fe85281ff7..31e177adbdf1 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -74,6 +74,7 @@ .b123 = 2, \ .c1c = 3, \ .d00d = 4, \ + .f10c = 0, \ }, \ .output_len = sizeof(struct core_reloc_arrays_output) \ } @@ -308,12 +309,15 @@ static struct core_reloc_test_case test_cases[] = { ARRAYS_CASE(arrays), ARRAYS_CASE(arrays___diff_arr_dim), ARRAYS_CASE(arrays___diff_arr_val_sz), + ARRAYS_CASE(arrays___equiv_zero_sz_arr), + ARRAYS_CASE(arrays___fixed_arr), ARRAYS_ERR_CASE(arrays___err_too_small), ARRAYS_ERR_CASE(arrays___err_too_shallow), ARRAYS_ERR_CASE(arrays___err_non_array), ARRAYS_ERR_CASE(arrays___err_wrong_val_type1), ARRAYS_ERR_CASE(arrays___err_wrong_val_type2), + ARRAYS_ERR_CASE(arrays___err_bad_zero_sz_arr), /* enum/ptr/int handling scenarios */ 
PRIMITIVES_CASE(primitives), diff --git a/tools/testing/selftests/bpf/prog_tests/cpu_mask.c b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c new file mode 100644 index 000000000000..1fa1bdbaffa9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <bpf/btf.h> +#include "libbpf_internal.h" + +static int duration = 0; + +static void validate_mask(int case_nr, const char *exp, bool *mask, int n) +{ + int i; + + for (i = 0; exp[i]; i++) { + if (exp[i] == '1') { + if (CHECK(i + 1 > n, "mask_short", + "case #%d: mask too short, got n=%d, need at least %d\n", + case_nr, n, i + 1)) + return; + CHECK(!mask[i], "cpu_not_set", + "case #%d: mask differs, expected cpu#%d SET\n", + case_nr, i); + } else { + CHECK(i < n && mask[i], "cpu_set", + "case #%d: mask differs, expected cpu#%d UNSET\n", + case_nr, i); + } + } + CHECK(i < n, "mask_long", + "case #%d: mask too long, got n=%d, expected at most %d\n", + case_nr, n, i); +} + +static struct { + const char *cpu_mask; + const char *expect; + bool fails; +} test_cases[] = { + { "0\n", "1", false }, + { "0,2\n", "101", false }, + { "0-2\n", "111", false }, + { "0-2,3-4\n", "11111", false }, + { "0", "1", false }, + { "0-2", "111", false }, + { "0,2", "101", false }, + { "0,1-3", "1111", false }, + { "0,1,2,3", "1111", false }, + { "0,2-3,5", "101101", false }, + { "3-3", "0001", false }, + { "2-4,6,9-10", "00111010011", false }, + /* failure cases */ + { "", "", true }, + { "0-", "", true }, + { "0 ", "", true }, + { "0_1", "", true }, + { "1-0", "", true }, + { "-1", "", true }, +}; + +void test_cpu_mask() +{ + int i, err, n; + bool *mask; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + mask = NULL; + err = parse_cpu_mask_str(test_cases[i].cpu_mask, &mask, &n); + if (test_cases[i].fails) { + CHECK(!err, "should_fail", + "case #%d: parsing should fail!\n", i + 1); + } else { + if (CHECK(err, "parse_err", + "case #%d: cpu mask parsing failed: %d\n", + i + 1, err)) + continue; + validate_mask(i + 1, test_cases[i].expect, mask, n); + } + free(mask); + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 40bcff2cc274..235ac4f67f5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -1,90 +1,55 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> +#include "test_pkt_access.skel.h" +#include "fentry_test.skel.h" +#include "fexit_test.skel.h" void test_fentry_fexit(void) { - struct bpf_prog_load_attr attr_fentry = { - .file = "./fentry_test.o", - }; - struct bpf_prog_load_attr attr_fexit = { - .file = "./fexit_test.o", - }; - - struct bpf_object *obj_fentry = NULL, *obj_fexit = NULL, *pkt_obj; - struct bpf_map *data_map_fentry, *data_map_fexit; - char fentry_name[] = "fentry/bpf_fentry_testX"; - char fexit_name[] = "fexit/bpf_fentry_testX"; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[12] = {}; - struct bpf_program *prog[12]; - __u32 duration, retval; - const int zero = 0; - u64 result[12]; - - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) + struct test_pkt_access *pkt_skel = NULL; + struct fentry_test *fentry_skel = NULL; + struct fexit_test *fexit_skel = NULL; + __u64 *fentry_res, *fexit_res; + __u32 duration = 0, 
retval; + int err, pkt_fd, i; + + pkt_skel = test_pkt_access__open_and_load(); + if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) return; - err = bpf_prog_load_xattr(&attr_fentry, &obj_fentry, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) + fentry_skel = fentry_test__open_and_load(); + if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto close_prog; - err = bpf_prog_load_xattr(&attr_fexit, &obj_fexit, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) + fexit_skel = fexit_test__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) goto close_prog; - for (i = 0; i < 6; i++) { - fentry_name[sizeof(fentry_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj_fentry, fentry_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", fentry_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map_fentry = bpf_object__find_map_by_name(obj_fentry, "fentry_t.bss"); - if (CHECK(!data_map_fentry, "find_data_map", "data map not found\n")) + err = fentry_test__attach(fentry_skel); + if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) goto close_prog; - - for (i = 6; i < 12; i++) { - fexit_name[sizeof(fexit_name) - 2] = '1' + i - 6; - prog[i] = bpf_object__find_program_by_title(obj_fexit, fexit_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", fexit_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map_fexit = bpf_object__find_map_by_name(obj_fexit, "fexit_te.bss"); - if (CHECK(!data_map_fexit, "find_data_map", "data map not found\n")) + err = fexit_test__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto close_prog; + pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map_fentry), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - err = bpf_map_lookup_elem(bpf_map__fd(data_map_fexit), &zero, result + 6); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - for (i = 0; i < 12; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i % 6 + 1, result[i])) - goto close_prog; + fentry_res = (__u64 *)fentry_skel->bss; + fexit_res = (__u64 *)fexit_skel->bss; + for (i = 0; i < 6; i++) { + CHECK(fentry_res[i] != 1, "result", + "fentry_test%d failed err %lld\n", i + 1, fentry_res[i]); + CHECK(fexit_res[i] != 1, "result", + "fexit_test%d failed err %lld\n", i + 1, fexit_res[i]); + } close_prog: - for (i = 0; i < 12; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj_fentry); - bpf_object__close(obj_fexit); - bpf_object__close(pkt_obj); + test_pkt_access__destroy(pkt_skel); + fentry_test__destroy(fentry_skel); + fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c
index 9fb103193878..e1a379f5f7d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -1,64 +1,43 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> +#include "test_pkt_access.skel.h" +#include "fentry_test.skel.h" void test_fentry_test(void) { - struct bpf_prog_load_attr attr = { - .file = "./fentry_test.o", - }; - - char prog_name[] = "fentry/bpf_fentry_testX"; - struct bpf_object *obj = NULL, *pkt_obj; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[6] = {}; - struct bpf_program *prog[6]; + struct test_pkt_access *pkt_skel = NULL; + struct fentry_test *fentry_skel = NULL; + int err, pkt_fd, i; __u32 duration, retval; - struct bpf_map *data_map; - const int zero = 0; - u64 result[6]; + __u64 *result; - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) + pkt_skel = test_pkt_access__open_and_load(); + if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) return; - err = bpf_prog_load_xattr(&attr, &obj, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) - goto close_prog; + fentry_skel = fentry_test__open_and_load(); + if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) + goto cleanup; - for (i = 0; i < 6; i++) { - prog_name[sizeof(prog_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map = bpf_object__find_map_by_name(obj, "fentry_t.bss"); - if (CHECK(!data_map, "find_data_map", "data map not found\n")) - goto close_prog; + err = fentry_test__attach(fentry_skel); + if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) + goto cleanup; + pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - for (i = 0; i < 6; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i + 1, result[i])) - goto close_prog; + result = (__u64 *)fentry_skel->bss; + for (i = 0; i < 6; i++) { + if (CHECK(result[i] != 1, "result", + "fentry_test%d failed err %lld\n", i + 1, result[i])) + goto cleanup; + } -close_prog: - for (i = 0; i < 6; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj); - bpf_object__close(pkt_obj); +cleanup: + fentry_test__destroy(fentry_skel); + test_pkt_access__destroy(pkt_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 051a6d48762c..16a814eb4d64 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -1,16 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include <sys/mman.h> +#include "test_mmap.skel.h" struct map_data { __u64 val[512 * 4]; }; -struct bss_data { - __u64 in_val; - __u64 out_val; -}; - static size_t 
roundup_page(size_t sz) { long page_size = sysconf(_SC_PAGE_SIZE); @@ -19,41 +15,25 @@ static size_t roundup_page(size_t sz) void test_mmap(void) { - const char *file = "test_mmap.o"; - const char *probe_name = "raw_tracepoint/sys_enter"; - const char *tp_name = "sys_enter"; - const size_t bss_sz = roundup_page(sizeof(struct bss_data)); + const size_t bss_sz = roundup_page(sizeof(struct test_mmap__bss)); const size_t map_sz = roundup_page(sizeof(struct map_data)); const int zero = 0, one = 1, two = 2, far = 1500; const long page_size = sysconf(_SC_PAGE_SIZE); int err, duration = 0, i, data_map_fd; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link = NULL; struct bpf_map *data_map, *bss_map; void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2; - volatile struct bss_data *bss_data; - volatile struct map_data *map_data; + struct test_mmap__bss *bss_data; + struct map_data *map_data; + struct test_mmap *skel; __u64 val = 0; - obj = bpf_object__open_file("test_mmap.o", NULL); - if (CHECK(IS_ERR(obj), "obj_open", "failed to open '%s': %ld\n", - file, PTR_ERR(obj))) + + skel = test_mmap__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) return; - prog = bpf_object__find_program_by_title(obj, probe_name); - if (CHECK(!prog, "find_probe", "prog '%s' not found\n", probe_name)) - goto cleanup; - err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "failed to load prog '%s': %d\n", - probe_name, err)) - goto cleanup; - bss_map = bpf_object__find_map_by_name(obj, "test_mma.bss"); - if (CHECK(!bss_map, "find_bss_map", ".bss map not found\n")) - goto cleanup; - data_map = bpf_object__find_map_by_name(obj, "data_map"); - if (CHECK(!data_map, "find_data_map", "data_map map not found\n")) - goto cleanup; + bss_map = skel->maps.bss; + data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); bss_mmaped = mmap(NULL, bss_sz, PROT_READ | PROT_WRITE, MAP_SHARED, @@ -77,13 +57,15 @@ void test_mmap(void) CHECK_FAIL(bss_data->in_val); CHECK_FAIL(bss_data->out_val); + CHECK_FAIL(skel->bss->in_val); + CHECK_FAIL(skel->bss->out_val); CHECK_FAIL(map_data->val[0]); CHECK_FAIL(map_data->val[1]); CHECK_FAIL(map_data->val[2]); CHECK_FAIL(map_data->val[far]); - link = bpf_program__attach_raw_tracepoint(prog, tp_name); - if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link))) + err = test_mmap__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) goto cleanup; bss_data->in_val = 123; @@ -94,6 +76,8 @@ void test_mmap(void) CHECK_FAIL(bss_data->in_val != 123); CHECK_FAIL(bss_data->out_val != 123); + CHECK_FAIL(skel->bss->in_val != 123); + CHECK_FAIL(skel->bss->out_val != 123); CHECK_FAIL(map_data->val[0] != 111); CHECK_FAIL(map_data->val[1] != 222); CHECK_FAIL(map_data->val[2] != 123); @@ -160,6 +144,8 @@ void test_mmap(void) usleep(1); CHECK_FAIL(bss_data->in_val != 321); CHECK_FAIL(bss_data->out_val != 321); + CHECK_FAIL(skel->bss->in_val != 321); + CHECK_FAIL(skel->bss->out_val != 321); CHECK_FAIL(map_data->val[0] != 111); CHECK_FAIL(map_data->val[1] != 222); CHECK_FAIL(map_data->val[2] != 321); @@ -203,6 +189,8 @@ void test_mmap(void) map_data = tmp2; CHECK_FAIL(bss_data->in_val != 321); CHECK_FAIL(bss_data->out_val != 321); + CHECK_FAIL(skel->bss->in_val != 321); + CHECK_FAIL(skel->bss->out_val != 321); CHECK_FAIL(map_data->val[0] != 111); CHECK_FAIL(map_data->val[1] != 222); CHECK_FAIL(map_data->val[2] != 321); @@ -214,7 +202,5 @@ cleanup: CHECK_FAIL(munmap(bss_mmaped, bss_sz)); if (map_mmaped) 
CHECK_FAIL(munmap(map_mmaped, map_sz)); - if (!IS_ERR_OR_NULL(link)) - bpf_link__destroy(link); - bpf_object__close(obj); + test_mmap__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 3003fddc0613..cf6c87936c69 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -4,6 +4,7 @@ #include <sched.h> #include <sys/socket.h> #include <test_progs.h> +#include "libbpf_internal.h" static void on_sample(void *ctx, int cpu, void *data, __u32 size) { @@ -19,7 +20,7 @@ static void on_sample(void *ctx, int cpu, void *data, __u32 size) void test_perf_buffer(void) { - int err, prog_fd, nr_cpus, i, duration = 0; + int err, prog_fd, on_len, nr_on_cpus = 0, nr_cpus, i, duration = 0; const char *prog_name = "kprobe/sys_nanosleep"; const char *file = "./test_perf_buffer.o"; struct perf_buffer_opts pb_opts = {}; @@ -29,15 +30,27 @@ void test_perf_buffer(void) struct bpf_object *obj; struct perf_buffer *pb; struct bpf_link *link; + bool *online; nr_cpus = libbpf_num_possible_cpus(); if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus)) return; + err = parse_cpu_mask_file("/sys/devices/system/cpu/online", + &online, &on_len); + if (CHECK(err, "nr_on_cpus", "err %d\n", err)) + return; + + for (i = 0; i < on_len; i++) + if (online[i]) + nr_on_cpus++; + /* load program */ err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd); - if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) - return; + if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) { + obj = NULL; + goto out_close; + } prog = bpf_object__find_program_by_title(obj, prog_name); if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name)) @@ -64,6 +77,11 @@ void test_perf_buffer(void) /* trigger kprobe on every CPU */ CPU_ZERO(&cpu_seen); for (i = 0; i < nr_cpus; i++) { + if (i >= on_len || !online[i]) { + printf("skipping offline CPU #%d\n", i); + continue; + } + CPU_ZERO(&cpu_set); CPU_SET(i, &cpu_set); @@ -81,8 +99,8 @@ void test_perf_buffer(void) if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) goto out_free_pb; - if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt", - "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen))) + if (CHECK(CPU_COUNT(&cpu_seen) != nr_on_cpus, "seen_cpu_cnt", + "expect %d, seen %d\n", nr_on_cpus, CPU_COUNT(&cpu_seen))) goto out_free_pb; out_free_pb: @@ -91,4 +109,5 @@ out_detach: bpf_link__destroy(link); out_close: bpf_object__close(obj); + free(online); } diff --git a/tools/testing/selftests/bpf/prog_tests/probe_user.c b/tools/testing/selftests/bpf/prog_tests/probe_user.c index 8a3187dec048..7aecfd9e87d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/probe_user.c +++ b/tools/testing/selftests/bpf/prog_tests/probe_user.c @@ -3,8 +3,7 @@ void test_probe_user(void) { -#define kprobe_name "__sys_connect" - const char *prog_name = "kprobe/" kprobe_name; + const char *prog_name = "kprobe/__sys_connect"; const char *obj_file = "./test_probe_user.o"; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, ); int err, results_map_fd, sock_fd, duration = 0; @@ -33,8 +32,7 @@ void test_probe_user(void) "err %d\n", results_map_fd)) goto cleanup; - kprobe_link = bpf_program__attach_kprobe(kprobe_prog, false, - kprobe_name); + kprobe_link = bpf_program__attach(kprobe_prog); if (CHECK(IS_ERR(kprobe_link), "attach_kprobe", "err %ld\n", PTR_ERR(kprobe_link))) { kprobe_link = NULL; diff --git 
a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c index d90acc13d1ec..563e12120e77 100644 --- a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c +++ b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c @@ -16,14 +16,11 @@ struct rdonly_map_subtest { void test_rdonly_maps(void) { - const char *prog_name_skip_loop = "raw_tracepoint/sys_enter:skip_loop"; - const char *prog_name_part_loop = "raw_tracepoint/sys_enter:part_loop"; - const char *prog_name_full_loop = "raw_tracepoint/sys_enter:full_loop"; const char *file = "test_rdonly_maps.o"; struct rdonly_map_subtest subtests[] = { - { "skip loop", prog_name_skip_loop, 0, 0 }, - { "part loop", prog_name_part_loop, 3, 2 + 3 + 4 }, - { "full loop", prog_name_full_loop, 4, 2 + 3 + 4 + 5 }, + { "skip loop", "skip_loop", 0, 0 }, + { "part loop", "part_loop", 3, 2 + 3 + 4 }, + { "full loop", "full_loop", 4, 2 + 3 + 4 + 5 }, }; int i, err, zero = 0, duration = 0; struct bpf_link *link = NULL; @@ -50,7 +47,7 @@ void test_rdonly_maps(void) if (!test__start_subtest(t->subtest_name)) continue; - prog = bpf_object__find_program_by_title(obj, t->prog_name); + prog = bpf_object__find_program_by_name(obj, t->prog_name); if (CHECK(!prog, "find_prog", "prog '%s' not found\n", t->prog_name)) goto cleanup; diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 7566c13eb51a..2c37ae7dc214 100644 --- a/tools/testing/selftests/bpf/test_select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -20,8 +20,11 @@ #include <bpf/libbpf.h> #include "bpf_rlimit.h" #include "bpf_util.h" + +#include "test_progs.h" #include "test_select_reuseport_common.h" +#define MAX_TEST_NAME 80 #define MIN_TCPHDR_LEN 20 #define UDPHDR_LEN 8 @@ -32,11 +35,11 @@ static int result_map, tmp_index_ovr_map, linum_map, data_check_map; static enum result expected_results[NR_RESULTS]; static int sk_fds[REUSEPORT_ARRAY_SIZE]; -static int reuseport_array, outer_map; +static int reuseport_array = -1, outer_map = -1; static int select_by_skb_data_prog; -static int saved_tcp_syncookie; +static int saved_tcp_syncookie = -1; static struct bpf_object *obj; -static int saved_tcp_fo; +static int saved_tcp_fo = -1; static __u32 index_zero; static int epfd; @@ -46,16 +49,21 @@ static union sa46 { sa_family_t family; } srv_sa; -#define CHECK(condition, tag, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ - printf(format); \ - exit(-1); \ +#define RET_IF(condition, tag, format...) ({ \ + if (CHECK_FAIL(condition)) { \ + printf(tag " " format); \ + return; \ + } \ +}) + +#define RET_ERR(condition, tag, format...) 
({ \ + if (CHECK_FAIL(condition)) { \ + printf(tag " " format); \ + return -1; \ } \ }) -static void create_maps(void) +static int create_maps(void) { struct bpf_create_map_attr attr = {}; @@ -67,8 +75,8 @@ static void create_maps(void) attr.max_entries = REUSEPORT_ARRAY_SIZE; reuseport_array = bpf_create_map_xattr(&attr); - CHECK(reuseport_array == -1, "creating reuseport_array", - "reuseport_array:%d errno:%d\n", reuseport_array, errno); + RET_ERR(reuseport_array == -1, "creating reuseport_array", + "reuseport_array:%d errno:%d\n", reuseport_array, errno); /* Creating outer_map */ attr.name = "outer_map"; @@ -78,63 +86,61 @@ static void create_maps(void) attr.max_entries = 1; attr.inner_map_fd = reuseport_array; outer_map = bpf_create_map_xattr(&attr); - CHECK(outer_map == -1, "creating outer_map", - "outer_map:%d errno:%d\n", outer_map, errno); + RET_ERR(outer_map == -1, "creating outer_map", + "outer_map:%d errno:%d\n", outer_map, errno); + + return 0; } -static void prepare_bpf_obj(void) +static int prepare_bpf_obj(void) { struct bpf_program *prog; struct bpf_map *map; int err; - struct bpf_object_open_attr attr = { - .file = "test_select_reuseport_kern.o", - .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, - }; - obj = bpf_object__open_xattr(&attr); - CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", - "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); - - prog = bpf_program__next(NULL, obj); - CHECK(!prog, "get first bpf_program", "!prog\n"); - bpf_program__set_type(prog, attr.prog_type); + obj = bpf_object__open("test_select_reuseport_kern.o"); + RET_ERR(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", + "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); map = bpf_object__find_map_by_name(obj, "outer_map"); - CHECK(!map, "find outer_map", "!map\n"); + RET_ERR(!map, "find outer_map", "!map\n"); err = bpf_map__reuse_fd(map, outer_map); - CHECK(err, "reuse outer_map", "err:%d\n", err); + RET_ERR(err, "reuse outer_map", "err:%d\n", err); err = bpf_object__load(obj); - CHECK(err, "load bpf_object", "err:%d\n", err); + RET_ERR(err, "load bpf_object", "err:%d\n", err); + prog = bpf_program__next(NULL, obj); + RET_ERR(!prog, "get first bpf_program", "!prog\n"); select_by_skb_data_prog = bpf_program__fd(prog); - CHECK(select_by_skb_data_prog == -1, "get prog fd", - "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); + RET_ERR(select_by_skb_data_prog == -1, "get prog fd", + "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); map = bpf_object__find_map_by_name(obj, "result_map"); - CHECK(!map, "find result_map", "!map\n"); + RET_ERR(!map, "find result_map", "!map\n"); result_map = bpf_map__fd(map); - CHECK(result_map == -1, "get result_map fd", - "result_map:%d\n", result_map); + RET_ERR(result_map == -1, "get result_map fd", + "result_map:%d\n", result_map); map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map"); - CHECK(!map, "find tmp_index_ovr_map", "!map\n"); + RET_ERR(!map, "find tmp_index_ovr_map", "!map\n"); tmp_index_ovr_map = bpf_map__fd(map); - CHECK(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", - "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); + RET_ERR(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", + "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); map = bpf_object__find_map_by_name(obj, "linum_map"); - CHECK(!map, "find linum_map", "!map\n"); + RET_ERR(!map, "find linum_map", "!map\n"); linum_map = bpf_map__fd(map); - CHECK(linum_map == -1, "get linum_map fd", - "linum_map:%d\n", linum_map); + RET_ERR(linum_map == -1, "get linum_map fd", + 
"linum_map:%d\n", linum_map); map = bpf_object__find_map_by_name(obj, "data_check_map"); - CHECK(!map, "find data_check_map", "!map\n"); + RET_ERR(!map, "find data_check_map", "!map\n"); data_check_map = bpf_map__fd(map); - CHECK(data_check_map == -1, "get data_check_map fd", - "data_check_map:%d\n", data_check_map); + RET_ERR(data_check_map == -1, "get data_check_map fd", + "data_check_map:%d\n", data_check_map); + + return 0; } static void sa46_init_loopback(union sa46 *sa, sa_family_t family) @@ -163,65 +169,73 @@ static int read_int_sysctl(const char *sysctl) int fd, ret; fd = open(sysctl, 0); - CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", - sysctl, fd, errno); + RET_ERR(fd == -1, "open(sysctl)", + "sysctl:%s fd:%d errno:%d\n", sysctl, fd, errno); ret = read(fd, buf, sizeof(buf)); - CHECK(ret <= 0, "read(sysctl)", "sysctl:%s ret:%d errno:%d\n", - sysctl, ret, errno); - close(fd); + RET_ERR(ret <= 0, "read(sysctl)", + "sysctl:%s ret:%d errno:%d\n", sysctl, ret, errno); + close(fd); return atoi(buf); } -static void write_int_sysctl(const char *sysctl, int v) +static int write_int_sysctl(const char *sysctl, int v) { int fd, ret, size; char buf[16]; fd = open(sysctl, O_RDWR); - CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", - sysctl, fd, errno); + RET_ERR(fd == -1, "open(sysctl)", + "sysctl:%s fd:%d errno:%d\n", sysctl, fd, errno); size = snprintf(buf, sizeof(buf), "%d", v); ret = write(fd, buf, size); - CHECK(ret != size, "write(sysctl)", - "sysctl:%s ret:%d size:%d errno:%d\n", sysctl, ret, size, errno); + RET_ERR(ret != size, "write(sysctl)", + "sysctl:%s ret:%d size:%d errno:%d\n", + sysctl, ret, size, errno); + close(fd); + return 0; } static void restore_sysctls(void) { - write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); + if (saved_tcp_fo != -1) + write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); + if (saved_tcp_syncookie != -1) + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); } -static void enable_fastopen(void) +static int enable_fastopen(void) { int fo; fo = read_int_sysctl(TCP_FO_SYSCTL); - write_int_sysctl(TCP_FO_SYSCTL, fo | 7); + if (fo < 0) + return -1; + + return write_int_sysctl(TCP_FO_SYSCTL, fo | 7); } -static void enable_syncookie(void) +static int enable_syncookie(void) { - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); + return write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); } -static void disable_syncookie(void) +static int disable_syncookie(void) { - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); + return write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); } -static __u32 get_linum(void) +static long get_linum(void) { __u32 linum; int err; err = bpf_map_lookup_elem(linum_map, &index_zero, &linum); - CHECK(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", - err, errno); + RET_ERR(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", + err, errno); return linum; } @@ -237,12 +251,12 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, addrlen = sizeof(cli_sa); err = getsockname(cli_fd, (struct sockaddr *)&cli_sa, &addrlen); - CHECK(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", - err, errno); + RET_IF(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", + err, errno); err = bpf_map_lookup_elem(data_check_map, &index_zero, &result); - CHECK(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", - err, errno); + RET_IF(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", + err, errno); if (type == SOCK_STREAM) { expected.len 
= MIN_TCPHDR_LEN; @@ -284,22 +298,22 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, printf("expected: (0x%x, %u, %u)\n", expected.eth_protocol, expected.ip_protocol, expected.bind_inany); - CHECK(1, "data_check result != expected", - "bpf_prog_linum:%u\n", get_linum()); + RET_IF(1, "data_check result != expected", + "bpf_prog_linum:%ld\n", get_linum()); } - CHECK(!result.hash, "data_check result.hash empty", - "result.hash:%u", result.hash); + RET_IF(!result.hash, "data_check result.hash empty", + "result.hash:%u", result.hash); expected.len += cmd ? sizeof(*cmd) : 0; if (type == SOCK_STREAM) - CHECK(expected.len > result.len, "expected.len > result.len", - "expected.len:%u result.len:%u bpf_prog_linum:%u\n", - expected.len, result.len, get_linum()); + RET_IF(expected.len > result.len, "expected.len > result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%ld\n", + expected.len, result.len, get_linum()); else - CHECK(expected.len != result.len, "expected.len != result.len", - "expected.len:%u result.len:%u bpf_prog_linum:%u\n", - expected.len, result.len, get_linum()); + RET_IF(expected.len != result.len, "expected.len != result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%ld\n", + expected.len, result.len, get_linum()); } static void check_results(void) @@ -310,8 +324,8 @@ static void check_results(void) for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &results[i]); - CHECK(err == -1, "lookup_elem(result_map)", - "i:%u err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); } for (i = 0; i < NR_RESULTS; i++) { @@ -337,10 +351,10 @@ static void check_results(void) printf(", %u", expected_results[i]); printf("]\n"); - CHECK(expected_results[broken] != results[broken], - "unexpected result", - "expected_results[%u] != results[%u] bpf_prog_linum:%u\n", - broken, broken, get_linum()); + RET_IF(expected_results[broken] != results[broken], + "unexpected result", + "expected_results[%u] != results[%u] bpf_prog_linum:%ld\n", + broken, broken, get_linum()); } static int send_data(int type, sa_family_t family, void *data, size_t len, @@ -350,17 +364,17 @@ static int send_data(int type, sa_family_t family, void *data, size_t len, int fd, err; fd = socket(family, type, 0); - CHECK(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); + RET_ERR(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); sa46_init_loopback(&cli_sa, family); err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); - CHECK(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); + RET_ERR(err == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, sizeof(srv_sa)); - CHECK(err != len && expected >= PASS, - "sendto()", "family:%u err:%d errno:%d expected:%d\n", - family, err, errno, expected); + RET_ERR(err != len && expected >= PASS, - "sendto()", "family:%u err:%d errno:%d expected:%d\n", - family, err, errno, expected); + "sendto()", "family:%u err:%d errno:%d expected:%d\n", + family, err, errno, expected); return fd; } @@ -375,47 +389,49 @@ static void do_test(int type, sa_family_t family, struct cmd *cmd, cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0, expected); + if (cli_fd < 0) + return; nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0); - CHECK((nev <= 0 && expected >= PASS) || - (nev > 0 && expected < PASS), - "nev <> expected", - "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", - nev, expected, type, family, - cmd ? cmd->reuseport_index : -1, - cmd ? 
cmd->pass_on_failure : -1); + RET_IF((nev <= 0 && expected >= PASS) || + (nev > 0 && expected < PASS), + "nev <> expected", + "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", + nev, expected, type, family, + cmd ? cmd->reuseport_index : -1, + cmd ? cmd->pass_on_failure : -1); check_results(); check_data(type, family, cmd, cli_fd); if (expected < PASS) return; - CHECK(expected != PASS_ERR_SK_SELECT_REUSEPORT && - cmd->reuseport_index != ev.data.u32, - "check cmd->reuseport_index", - "cmd:(%u, %u) ev.data.u32:%u\n", - cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); + RET_IF(expected != PASS_ERR_SK_SELECT_REUSEPORT && + cmd->reuseport_index != ev.data.u32, + "check cmd->reuseport_index", + "cmd:(%u, %u) ev.data.u32:%u\n", + cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); srv_fd = sk_fds[ev.data.u32]; if (type == SOCK_STREAM) { int new_fd = accept(srv_fd, NULL, 0); - CHECK(new_fd == -1, "accept(srv_fd)", - "ev.data.u32:%u new_fd:%d errno:%d\n", - ev.data.u32, new_fd, errno); + RET_IF(new_fd == -1, "accept(srv_fd)", + "ev.data.u32:%u new_fd:%d errno:%d\n", + ev.data.u32, new_fd, errno); nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); - CHECK(nread != sizeof(rcv_cmd), - "recv(new_fd)", - "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", - ev.data.u32, nread, sizeof(rcv_cmd), errno); + RET_IF(nread != sizeof(rcv_cmd), + "recv(new_fd)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); close(new_fd); } else { nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); - CHECK(nread != sizeof(rcv_cmd), - "recv(sk_fds)", - "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", - ev.data.u32, nread, sizeof(rcv_cmd), errno); + RET_IF(nread != sizeof(rcv_cmd), + "recv(sk_fds)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); } close(cli_fd); @@ -428,18 +444,14 @@ static void test_err_inner_map(int type, sa_family_t family) .pass_on_failure = 0, }; - printf("%s: ", __func__); expected_results[DROP_ERR_INNER_MAP]++; do_test(type, family, &cmd, DROP_ERR_INNER_MAP); - printf("OK\n"); } static void test_err_skb_data(int type, sa_family_t family) { - printf("%s: ", __func__); expected_results[DROP_ERR_SKB_DATA]++; do_test(type, family, NULL, DROP_ERR_SKB_DATA); - printf("OK\n"); } static void test_err_sk_select_port(int type, sa_family_t family) @@ -449,10 +461,8 @@ static void test_err_sk_select_port(int type, sa_family_t family) .pass_on_failure = 0, }; - printf("%s: ", __func__); expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++; do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT); - printf("OK\n"); } static void test_pass(int type, sa_family_t family) @@ -460,14 +470,12 @@ static void test_pass(int type, sa_family_t family) struct cmd cmd; int i; - printf("%s: ", __func__); cmd.pass_on_failure = 0; for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { expected_results[PASS]++; cmd.reuseport_index = i; do_test(type, family, &cmd, PASS); } - printf("OK\n"); } static void test_syncookie(int type, sa_family_t family) @@ -481,7 +489,6 @@ static void test_syncookie(int type, sa_family_t family) if (type != SOCK_STREAM) return; - printf("%s: ", __func__); /* * +1 for TCP-SYN and * +1 for the TCP-ACK (ack the syncookie) @@ -497,17 +504,16 @@ static void test_syncookie(int type, sa_family_t family) */ err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &tmp_index, BPF_ANY); - CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", - 
"err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", + "err:%d errno:%d\n", err, errno); do_test(type, family, &cmd, PASS); err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero, &tmp_index); - CHECK(err == -1 || tmp_index != -1, - "lookup_elem(tmp_index_ovr_map)", - "err:%d errno:%d tmp_index:%d\n", - err, errno, tmp_index); + RET_IF(err == -1 || tmp_index != -1, + "lookup_elem(tmp_index_ovr_map)", + "err:%d errno:%d tmp_index:%d\n", + err, errno, tmp_index); disable_syncookie(); - printf("OK\n"); } static void test_pass_on_err(int type, sa_family_t family) @@ -517,10 +523,8 @@ static void test_pass_on_err(int type, sa_family_t family) .pass_on_failure = 1, }; - printf("%s: ", __func__); expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1; do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT); - printf("OK\n"); } static void test_detach_bpf(int type, sa_family_t family) @@ -532,46 +536,47 @@ static void test_detach_bpf(int type, sa_family_t family) struct cmd cmd = {}; int optvalue = 0; - printf("%s: ", __func__); err = setsockopt(sk_fds[0], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF, &optvalue, sizeof(optvalue)); - CHECK(err == -1, "setsockopt(SO_DETACH_REUSEPORT_BPF)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "setsockopt(SO_DETACH_REUSEPORT_BPF)", + "err:%d errno:%d\n", err, errno); err = setsockopt(sk_fds[1], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF, &optvalue, sizeof(optvalue)); - CHECK(err == 0 || errno != ENOENT, "setsockopt(SO_DETACH_REUSEPORT_BPF)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == 0 || errno != ENOENT, + "setsockopt(SO_DETACH_REUSEPORT_BPF)", + "err:%d errno:%d\n", err, errno); for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &tmp); - CHECK(err == -1, "lookup_elem(result_map)", - "i:%u err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); nr_run_before += tmp; } cli_fd = send_data(type, family, &cmd, sizeof(cmd), PASS); + if (cli_fd < 0) + return; nev = epoll_wait(epfd, &ev, 1, 5); - CHECK(nev <= 0, "nev <= 0", - "nev:%d expected:1 type:%d family:%d data:(0, 0)\n", - nev, type, family); + RET_IF(nev <= 0, "nev <= 0", + "nev:%d expected:1 type:%d family:%d data:(0, 0)\n", + nev, type, family); for (i = 0; i < NR_RESULTS; i++) { err = bpf_map_lookup_elem(result_map, &i, &tmp); - CHECK(err == -1, "lookup_elem(result_map)", - "i:%u err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); nr_run_after += tmp; } - CHECK(nr_run_before != nr_run_after, - "nr_run_before != nr_run_after", - "nr_run_before:%u nr_run_after:%u\n", - nr_run_before, nr_run_after); + RET_IF(nr_run_before != nr_run_after, + "nr_run_before != nr_run_after", + "nr_run_before:%u nr_run_after:%u\n", + nr_run_before, nr_run_after); - printf("OK\n"); close(cli_fd); #else - printf("%s: SKIP\n", __func__); + test__skip(); #endif } @@ -594,73 +599,83 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany) */ for (i = first; i >= 0; i--) { sk_fds[i] = socket(family, type, 0); - CHECK(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", - i, sk_fds[i], errno); + RET_IF(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", + i, sk_fds[i], errno); err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT, &optval, sizeof(optval)); - CHECK(err == -1, "setsockopt(SO_REUSEPORT)", - "sk_fds[%d] err:%d errno:%d\n", - i, err, errno); + RET_IF(err == -1, 
"setsockopt(SO_REUSEPORT)", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); if (i == first) { err = setsockopt(sk_fds[i], SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &select_by_skb_data_prog, sizeof(select_by_skb_data_prog)); - CHECK(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", + "err:%d errno:%d\n", err, errno); } err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen); - CHECK(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", - i, err, errno); + RET_IF(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); if (type == SOCK_STREAM) { err = listen(sk_fds[i], 10); - CHECK(err == -1, "listen()", - "sk_fds[%d] err:%d errno:%d\n", - i, err, errno); + RET_IF(err == -1, "listen()", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); } err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i], BPF_NOEXIST); - CHECK(err == -1, "update_elem(reuseport_array)", - "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "update_elem(reuseport_array)", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); if (i == first) { socklen_t addrlen = sizeof(srv_sa); err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa, &addrlen); - CHECK(err == -1, "getsockname()", - "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + RET_IF(err == -1, "getsockname()", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); } } epfd = epoll_create(1); - CHECK(epfd == -1, "epoll_create(1)", - "epfd:%d errno:%d\n", epfd, errno); + RET_IF(epfd == -1, "epoll_create(1)", + "epfd:%d errno:%d\n", epfd, errno); ev.events = EPOLLIN; for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { ev.data.u32 = i; err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev); - CHECK(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); + RET_IF(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); } } -static void setup_per_test(int type, unsigned short family, bool inany) +static void setup_per_test(int type, sa_family_t family, bool inany, + bool no_inner_map) { int ovr = -1, err; prepare_sk_fds(type, family, inany); err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr, BPF_ANY); - CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", + "err:%d errno:%d\n", err, errno); + + /* Install reuseport_array to outer_map? */ + if (no_inner_map) + return; + + err = bpf_map_update_elem(outer_map, &index_zero, &reuseport_array, + BPF_ANY); + RET_IF(err == -1, "update_elem(outer_map, 0, reuseport_array)", + "err:%d errno:%d\n", err, errno); } -static void cleanup_per_test(void) +static void cleanup_per_test(bool no_inner_map) { int i, err; @@ -668,75 +683,124 @@ static void cleanup_per_test(void) close(sk_fds[i]); close(epfd); + /* Delete reuseport_array from outer_map? 
*/ + if (no_inner_map) + return; + err = bpf_map_delete_elem(outer_map, &index_zero); - CHECK(err == -1, "delete_elem(outer_map)", - "err:%d errno:%d\n", err, errno); + RET_IF(err == -1, "delete_elem(outer_map)", + "err:%d errno:%d\n", err, errno); } static void cleanup(void) { - close(outer_map); - close(reuseport_array); - bpf_object__close(obj); + if (outer_map != -1) + close(outer_map); + if (reuseport_array != -1) + close(reuseport_array); + if (obj) + bpf_object__close(obj); } -static void test_all(void) +static const char *family_str(sa_family_t family) { - /* Extra SOCK_STREAM to test bind_inany==true */ - const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; - const char * const type_strings[] = { "TCP", "UDP", "TCP" }; - const char * const family_strings[] = { "IPv6", "IPv4" }; - const unsigned short families[] = { AF_INET6, AF_INET }; - const bool bind_inany[] = { false, false, true }; - int t, f, err; - - for (f = 0; f < ARRAY_SIZE(families); f++) { - unsigned short family = families[f]; - - for (t = 0; t < ARRAY_SIZE(types); t++) { - bool inany = bind_inany[t]; - int type = types[t]; - - printf("######## %s/%s %s ########\n", - family_strings[f], type_strings[t], - inany ? " INANY " : "LOOPBACK"); - - setup_per_test(type, family, inany); - - test_err_inner_map(type, family); - - /* Install reuseport_array to the outer_map */ - err = bpf_map_update_elem(outer_map, &index_zero, - &reuseport_array, BPF_ANY); - CHECK(err == -1, "update_elem(outer_map)", - "err:%d errno:%d\n", err, errno); - - test_err_skb_data(type, family); - test_err_sk_select_port(type, family); - test_pass(type, family); - test_syncookie(type, family); - test_pass_on_err(type, family); - /* Must be the last test */ - test_detach_bpf(type, family); - - cleanup_per_test(); - printf("\n"); - } + switch (family) { + case AF_INET: + return "IPv4"; + case AF_INET6: + return "IPv6"; + default: + return "unknown"; + } +} + +static const char *sotype_str(int sotype) +{ + switch (sotype) { + case SOCK_STREAM: + return "TCP"; + case SOCK_DGRAM: + return "UDP"; + default: + return "unknown"; + } +} + +#define TEST_INIT(fn, ...) { fn, #fn, __VA_ARGS__ } + +static void test_config(int sotype, sa_family_t family, bool inany) +{ + const struct test { + void (*fn)(int sotype, sa_family_t family); + const char *name; + bool no_inner_map; + } tests[] = { + TEST_INIT(test_err_inner_map, true /* no_inner_map */), + TEST_INIT(test_err_skb_data), + TEST_INIT(test_err_sk_select_port), + TEST_INIT(test_pass), + TEST_INIT(test_syncookie), + TEST_INIT(test_pass_on_err), + TEST_INIT(test_detach_bpf), + }; + char s[MAX_TEST_NAME]; + const struct test *t; + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + snprintf(s, sizeof(s), "%s/%s %s %s", + family_str(family), sotype_str(sotype), + inany ? 
"INANY" : "LOOPBACK", t->name); + + if (!test__start_subtest(s)) + continue; + + setup_per_test(sotype, family, inany, t->no_inner_map); + t->fn(sotype, family); + cleanup_per_test(t->no_inner_map); } } -int main(int argc, const char **argv) +#define BIND_INANY true + +static void test_all(void) { - create_maps(); - prepare_bpf_obj(); + const struct config { + int sotype; + sa_family_t family; + bool inany; + } configs[] = { + { SOCK_STREAM, AF_INET }, + { SOCK_STREAM, AF_INET, BIND_INANY }, + { SOCK_STREAM, AF_INET6 }, + { SOCK_STREAM, AF_INET6, BIND_INANY }, + { SOCK_DGRAM, AF_INET }, + { SOCK_DGRAM, AF_INET6 }, + }; + const struct config *c; + + for (c = configs; c < configs + ARRAY_SIZE(configs); c++) + test_config(c->sotype, c->family, c->inany); +} + +void test_select_reuseport(void) +{ + if (create_maps()) + goto out; + if (prepare_bpf_obj()) + goto out; + saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); - enable_fastopen(); - disable_syncookie(); - atexit(restore_sysctls); + if (saved_tcp_syncookie < 0 || saved_tcp_syncookie < 0) + goto out; - test_all(); + if (enable_fastopen()) + goto out; + if (disable_syncookie()) + goto out; + test_all(); +out: cleanup(); - return 0; + restore_sysctls(); } diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index a2eb8db8dafb..c6d6b685a946 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -11,6 +11,9 @@ void test_skb_ctx(void) .cb[4] = 5, .priority = 6, .tstamp = 7, + .wire_len = 100, + .gso_segs = 8, + .mark = 9, }; struct bpf_prog_test_run_attr tattr = { .data_in = &pkt_v4, @@ -91,4 +94,8 @@ void test_skb_ctx(void) "ctx_out_tstamp", "skb->tstamp == %lld, expected %d\n", skb.tstamp, 8); + CHECK_ATTR(skb.mark != 10, + "ctx_out_mark", + "skb->mark == %u, expected %d\n", + skb.mark, 10); } diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c new file mode 100644 index 000000000000..9264a2736018 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <test_progs.h> + +struct s { + int a; + long long b; +} __attribute__((packed)); + +#include "test_skeleton.skel.h" + +void test_skeleton(void) +{ + int duration = 0, err; + struct test_skeleton* skel; + struct test_skeleton__bss *bss; + struct test_skeleton__kconfig *kcfg; + + skel = test_skeleton__open(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + if (CHECK(skel->kconfig, "skel_kconfig", "kconfig is mmaped()!\n")) + goto cleanup; + + err = test_skeleton__load(skel); + if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err)) + goto cleanup; + + bss = skel->bss; + bss->in1 = 1; + bss->in2 = 2; + bss->in3 = 3; + bss->in4 = 4; + bss->in5.a = 5; + bss->in5.b = 6; + kcfg = skel->kconfig; + + err = test_skeleton__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + CHECK(bss->out1 != 1, "res1", "got %d != exp %d\n", bss->out1, 1); + CHECK(bss->out2 != 2, "res2", "got %lld != exp %d\n", bss->out2, 2); + CHECK(bss->out3 != 3, "res3", "got %d != exp %d\n", (int)bss->out3, 3); + CHECK(bss->out4 != 4, "res4", "got %lld != exp %d\n", bss->out4, 4); + CHECK(bss->handler_out5.a != 5, "res5", "got %d != exp %d\n", + 
bss->handler_out5.a, 5); + CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n", + bss->handler_out5.b, 6); + + CHECK(bss->bpf_syscall != kcfg->CONFIG_BPF_SYSCALL, "ext1", + "got %d != exp %d\n", bss->bpf_syscall, kcfg->CONFIG_BPF_SYSCALL); + CHECK(bss->kern_ver != kcfg->LINUX_KERNEL_VERSION, "ext2", + "got %d != exp %d\n", bss->kern_ver, kcfg->LINUX_KERNEL_VERSION); + +cleanup: + test_skeleton__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c index d841dced971f..e8399ae50e77 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c @@ -1,16 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include "test_stacktrace_build_id.skel.h" void test_stacktrace_build_id(void) { + int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; - const char *prog_name = "tracepoint/random/urandom_read"; - const char *file = "./test_stacktrace_build_id.o"; - int err, prog_fd, stack_trace_len; + struct test_stacktrace_build_id *skel; + int err, stack_trace_len; __u32 key, previous_key, val, duration = 0; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link = NULL; char buf[256]; int i, j; struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH]; @@ -18,43 +16,24 @@ void test_stacktrace_build_id(void) int retry = 1; retry: - err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + skel = test_stacktrace_build_id__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) return; - prog = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name)) - goto close_prog; - - link = bpf_program__attach_tracepoint(prog, "random", "urandom_read"); - if (CHECK(IS_ERR(link), "attach_tp", "err %ld\n", PTR_ERR(link))) - goto close_prog; + err = test_stacktrace_build_id__attach(skel); + if (CHECK(err, "attach_tp", "err %d\n", err)) + goto cleanup; /* find map fds */ - control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK(control_map_fd < 0, "bpf_find_map control_map", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n", - err, errno)) - goto disable_pmu; - - stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); - if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; + control_map_fd = bpf_map__fd(skel->maps.control_map); + stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap); + stackmap_fd = bpf_map__fd(skel->maps.stackmap); + stack_amap_fd = bpf_map__fd(skel->maps.stack_amap); if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null"))) - goto disable_pmu; + goto cleanup; if (CHECK_FAIL(system("./urandom_read"))) - goto disable_pmu; + goto cleanup; /* disable stack trace collection */ key = 0; val = 1; @@ -66,23 +45,23 @@ retry: err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. 
stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = extract_build_id(buf, 256); if (CHECK(err, "get build_id with readelf", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = bpf_map_get_next_key(stackmap_fd, NULL, &key); if (CHECK(err, "get_next_key from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; do { char build_id[64]; @@ -90,7 +69,7 @@ retry: err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs); if (CHECK(err, "lookup_elem from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i) if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID && id_offs[i].offset != 0) { @@ -108,8 +87,7 @@ retry: * try it one more time. */ if (build_id_matches < 1 && retry--) { - bpf_link__destroy(link); - bpf_object__close(obj); + test_stacktrace_build_id__destroy(skel); printf("%s:WARN:Didn't find expected build ID from the map, retrying\n", __func__); goto retry; @@ -117,17 +95,14 @@ retry: if (CHECK(build_id_matches < 1, "build id match", "Didn't find expected build ID from the map\n")) - goto disable_pmu; + goto cleanup; - stack_trace_len = PERF_MAX_STACK_DEPTH - * sizeof(struct bpf_stack_build_id); + stack_trace_len = PERF_MAX_STACK_DEPTH * + sizeof(struct bpf_stack_build_id); err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len); CHECK(err, "compare_stack_ips stackmap vs. stack_amap", "err %d errno %d\n", err, errno); -disable_pmu: - bpf_link__destroy(link); - -close_prog: - bpf_object__close(obj); +cleanup: + test_stacktrace_build_id__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index f62aa0eb959b..8974450a4bdb 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include "test_stacktrace_build_id.skel.h" static __u64 read_perf_max_sample_freq(void) { @@ -16,19 +17,15 @@ static __u64 read_perf_max_sample_freq(void) void test_stacktrace_build_id_nmi(void) { - int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; - const char *prog_name = "tracepoint/random/urandom_read"; - const char *file = "./test_stacktrace_build_id.o"; - int err, pmu_fd, prog_fd; + int control_map_fd, stackid_hmap_fd, stackmap_fd; + struct test_stacktrace_build_id *skel; + int err, pmu_fd; struct perf_event_attr attr = { .freq = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, }; __u32 key, previous_key, val, duration = 0; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link; char buf[256]; int i, j; struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH]; @@ -38,13 +35,16 @@ void test_stacktrace_build_id_nmi(void) attr.sample_freq = read_perf_max_sample_freq(); retry: - err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd); - if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + skel = test_stacktrace_build_id__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; - prog = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name)) - 
goto close_prog; + /* override program type */ + bpf_program__set_perf_event(skel->progs.oncpu); + + err = test_stacktrace_build_id__load(skel); + if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err)) + goto cleanup; pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, @@ -52,40 +52,25 @@ retry: if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n", pmu_fd, errno)) - goto close_prog; + goto cleanup; - link = bpf_program__attach_perf_event(prog, pmu_fd); - if (CHECK(IS_ERR(link), "attach_perf_event", - "err %ld\n", PTR_ERR(link))) { + skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, + pmu_fd); + if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event", + "err %ld\n", PTR_ERR(skel->links.oncpu))) { close(pmu_fd); - goto close_prog; + goto cleanup; } /* find map fds */ - control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK(control_map_fd < 0, "bpf_find_map control_map", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; - - stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n", - err, errno)) - goto disable_pmu; - - stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); - if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap", - "err %d errno %d\n", err, errno)) - goto disable_pmu; + control_map_fd = bpf_map__fd(skel->maps.control_map); + stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap); + stackmap_fd = bpf_map__fd(skel->maps.stackmap); if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null"))) - goto disable_pmu; + goto cleanup; if (CHECK_FAIL(system("taskset 0x1 ./urandom_read 100000"))) - goto disable_pmu; + goto cleanup; /* disable stack trace collection */ key = 0; val = 1; @@ -97,23 +82,23 @@ retry: err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = extract_build_id(buf, 256); if (CHECK(err, "get build_id with readelf", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; err = bpf_map_get_next_key(stackmap_fd, NULL, &key); if (CHECK(err, "get_next_key from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; do { char build_id[64]; @@ -121,7 +106,7 @@ retry: err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs); if (CHECK(err, "lookup_elem from stackmap", "err %d, errno %d\n", err, errno)) - goto disable_pmu; + goto cleanup; for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i) if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID && id_offs[i].offset != 0) { @@ -139,8 +124,7 @@ retry: * try it one more time. 
*/ if (build_id_matches < 1 && retry--) { - bpf_link__destroy(link); - bpf_object__close(obj); + test_stacktrace_build_id__destroy(skel); printf("%s:WARN:Didn't find expected build ID from the map, retrying\n", __func__); goto retry; @@ -148,7 +132,7 @@ retry: if (CHECK(build_id_matches < 1, "build id match", "Didn't find expected build ID from the map\n")) - goto disable_pmu; + goto cleanup; /* * We intentionally skip compare_stack_ips(). This is because we @@ -157,8 +141,6 @@ retry: * BPF_STACK_BUILD_ID_IP; */ -disable_pmu: - bpf_link__destroy(link); -close_prog: - bpf_object__close(obj); +cleanup: + test_stacktrace_build_id__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c new file mode 100644 index 000000000000..7185bee16fe4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> + +void test_xdp_perf(void) +{ + const char *file = "./xdp_dummy.o"; + __u32 duration, retval, size; + struct bpf_object *obj; + char in[128], out[128]; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1000000, &in[0], 128, + out, &size, &retval, &duration); + + CHECK(err || retval != XDP_PASS || size != 128, + "xdp-perf", + "err %d errno %d retval %d size %d\n", + err, errno, retval, size); + + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c new file mode 100644 index 000000000000..65eac371b061 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_arrays___equiv_zero_sz_arr x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c new file mode 100644 index 000000000000..ecda2b545ac2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_arrays___err_bad_zero_sz_arr x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c new file mode 100644 index 000000000000..fe1d01232c22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_arrays___fixed_arr x) {} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 9311489e14b2..6d598cfbdb3e 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -327,6 +327,7 @@ struct core_reloc_arrays_output { char b123; int c1c; int d00d; + int f10c; }; struct core_reloc_arrays_substruct { @@ -339,6 +340,7 @@ struct core_reloc_arrays { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; /* bigger array dimensions */ @@ -347,6 +349,7 @@ struct core_reloc_arrays___diff_arr_dim { char b[3][4][5]; struct core_reloc_arrays_substruct c[4]; struct 
core_reloc_arrays_substruct d[2][3]; + struct core_reloc_arrays_substruct f[1][3]; }; /* different size of array's value (struct) */ @@ -363,6 +366,29 @@ struct core_reloc_arrays___diff_arr_val_sz { int d; int __padding2; } d[1][2]; + struct { + int __padding1; + int c; + int __padding2; + } f[][2]; +}; + +struct core_reloc_arrays___equiv_zero_sz_arr { + int a[5]; + char b[2][3][4]; + struct core_reloc_arrays_substruct c[3]; + struct core_reloc_arrays_substruct d[1][2]; + /* equivalent to flexible array */ + struct core_reloc_arrays_substruct f[0][2]; +}; + +struct core_reloc_arrays___fixed_arr { + int a[5]; + char b[2][3][4]; + struct core_reloc_arrays_substruct c[3]; + struct core_reloc_arrays_substruct d[1][2]; + /* not a flexible array anymore, but within access bounds */ + struct core_reloc_arrays_substruct f[1][2]; }; struct core_reloc_arrays___err_too_small { @@ -370,6 +396,7 @@ struct core_reloc_arrays___err_too_small { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___err_too_shallow { @@ -377,6 +404,7 @@ struct core_reloc_arrays___err_too_shallow { char b[2][3]; /* this one lacks one dimension */ struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___err_non_array { @@ -384,6 +412,7 @@ struct core_reloc_arrays___err_non_array { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___err_wrong_val_type { @@ -391,6 +420,16 @@ struct core_reloc_arrays___err_wrong_val_type { char b[2][3][4]; int c[3]; /* value is not a struct */ struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; +}; + +struct core_reloc_arrays___err_bad_zero_sz_arr { + /* zero-sized array, but not at the end */ + struct core_reloc_arrays_substruct f[0][2]; + int a[5]; + char b[2][3][4]; + struct core_reloc_arrays_substruct c[3]; + struct core_reloc_arrays_substruct d[1][2]; }; /* diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 534621e38906..221b69700625 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -5,46 +5,36 @@ #include <linux/bpf.h> #include "bpf_helpers.h" -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4); - __type(key, int); - __type(value, int); -} results_map SEC(".maps"); +int kprobe_res = 0; +int kretprobe_res = 0; +int uprobe_res = 0; +int uretprobe_res = 0; SEC("kprobe/sys_nanosleep") -int handle_sys_nanosleep_entry(struct pt_regs *ctx) +int handle_kprobe(struct pt_regs *ctx) { - const int key = 0, value = 1; - - bpf_map_update_elem(&results_map, &key, &value, 0); + kprobe_res = 1; return 0; } SEC("kretprobe/sys_nanosleep") -int handle_sys_getpid_return(struct pt_regs *ctx) +int handle_kretprobe(struct pt_regs *ctx) { - const int key = 1, value = 2; - - bpf_map_update_elem(&results_map, &key, &value, 0); + kretprobe_res = 2; return 0; } SEC("uprobe/trigger_func") -int handle_uprobe_entry(struct pt_regs *ctx) +int handle_uprobe(struct pt_regs *ctx) { - const int key = 2, value = 3; - - bpf_map_update_elem(&results_map, &key, &value, 0); + uprobe_res = 3; return 0; } SEC("uretprobe/trigger_func") -int handle_uprobe_return(struct pt_regs *ctx) 
+int handle_uretprobe(struct pt_regs *ctx) { - const int key = 3, value = 4; - - bpf_map_update_elem(&results_map, &key, &value, 0); + uretprobe_res = 4; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_core_extern.c b/tools/testing/selftests/bpf/progs/test_core_extern.c new file mode 100644 index 000000000000..9bfc91d9d004 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_extern.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <stdint.h> +#include <stdbool.h> +#include <linux/ptrace.h> +#include <linux/bpf.h> +#include "bpf_helpers.h" + +/* non-existing BPF helper, to test dead code elimination */ +static int (*bpf_missing_helper)(const void *arg1, int arg2) = (void *) 999; + +extern int LINUX_KERNEL_VERSION __kconfig; +extern bool CONFIG_BPF_SYSCALL __kconfig; /* strong */ +extern enum libbpf_tristate CONFIG_TRISTATE __kconfig __weak; +extern bool CONFIG_BOOL __kconfig __weak; +extern char CONFIG_CHAR __kconfig __weak; +extern uint16_t CONFIG_USHORT __kconfig __weak; +extern int CONFIG_INT __kconfig __weak; +extern uint64_t CONFIG_ULONG __kconfig __weak; +extern const char CONFIG_STR[8] __kconfig __weak; +extern uint64_t CONFIG_MISSING __kconfig __weak; + +uint64_t kern_ver = -1; +uint64_t bpf_syscall = -1; +uint64_t tristate_val = -1; +uint64_t bool_val = -1; +uint64_t char_val = -1; +uint64_t ushort_val = -1; +uint64_t int_val = -1; +uint64_t ulong_val = -1; +char str_val[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; +uint64_t missing_val = -1; + +SEC("raw_tp/sys_enter") +int handle_sys_enter(struct pt_regs *ctx) +{ + int i; + + kern_ver = LINUX_KERNEL_VERSION; + bpf_syscall = CONFIG_BPF_SYSCALL; + tristate_val = CONFIG_TRISTATE; + bool_val = CONFIG_BOOL; + char_val = CONFIG_CHAR; + ushort_val = CONFIG_USHORT; + int_val = CONFIG_INT; + ulong_val = CONFIG_ULONG; + + for (i = 0; i < sizeof(CONFIG_STR); i++) { + str_val[i] = CONFIG_STR[i]; + } + + if (CONFIG_MISSING) + /* invalid, but dead code - never executed */ + missing_val = bpf_missing_helper(ctx, 123); + else + missing_val = 0xDEADC0DE; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c index 89951b684282..053b86f6b53f 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c @@ -18,6 +18,7 @@ struct core_reloc_arrays_output { char b123; int c1c; int d00d; + int f01c; }; struct core_reloc_arrays_substruct { @@ -30,6 +31,7 @@ struct core_reloc_arrays { char b[2][3][4]; struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; + struct core_reloc_arrays_substruct f[][2]; }; #define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src) @@ -40,18 +42,16 @@ int test_core_arrays(void *ctx) struct core_reloc_arrays *in = (void *)&data.in; struct core_reloc_arrays_output *out = (void *)&data.out; - /* in->a[2] */ if (CORE_READ(&out->a2, &in->a[2])) return 1; - /* in->b[1][2][3] */ if (CORE_READ(&out->b123, &in->b[1][2][3])) return 1; - /* in->c[1].c */ if (CORE_READ(&out->c1c, &in->c[1].c)) return 1; - /* in->d[0][0].d */ if (CORE_READ(&out->d00d, &in->d[0][0].d)) return 1; + if (CORE_READ(&out->f01c, &in->f[0][1].c)) + return 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c index ea7d84f01235..b1f09f5bb1cf 
100644 --- a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c +++ b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c @@ -62,7 +62,7 @@ struct { goto done; \ }) -SEC("select_by_skb_data") +SEC("sk_reuseport") int _select_by_skb_data(struct sk_reuseport_md *reuse_md) { __u32 linum, index = 0, flags = 0, index_zero = 0; diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index 2a9f4c736ebc..e18da87fe84f 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -17,6 +17,12 @@ int process(struct __sk_buff *skb) } skb->priority++; skb->tstamp++; + skb->mark++; + + if (skb->wire_len != 100) + return 1; + if (skb->gso_segs != 8) + return 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c new file mode 100644 index 000000000000..4f69aac5635f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <stdbool.h> +#include <linux/bpf.h> +#include "bpf_helpers.h" + +struct s { + int a; + long long b; +} __attribute__((packed)); + +int in1 = 0; +long long in2 = 0; +char in3 = '\0'; +long long in4 __attribute__((aligned(64))) = 0; +struct s in5 = {}; + +long long out2 = 0; +char out3 = 0; +long long out4 = 0; +int out1 = 0; + +extern bool CONFIG_BPF_SYSCALL __kconfig; +extern int LINUX_KERNEL_VERSION __kconfig; +bool bpf_syscall = 0; +int kern_ver = 0; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + static volatile struct s out5; + + out1 = in1; + out2 = in2; + out3 = in3; + out4 = in4; + out5 = in5; + + bpf_syscall = CONFIG_BPF_SYSCALL; + kern_ver = LINUX_KERNEL_VERSION; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_cgroup_attach.c b/tools/testing/selftests/bpf/test_cgroup_attach.c deleted file mode 100644 index 7671909ee1cb..000000000000 --- a/tools/testing/selftests/bpf/test_cgroup_attach.c +++ /dev/null @@ -1,571 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* eBPF example program: - * - * - Creates arraymap in kernel with 4 bytes keys and 8 byte values - * - * - Loads eBPF program - * - * The eBPF program accesses the map passed in to store two pieces of - * information. The number of invocations of the program, which maps - * to the number of packets received, is stored to key 0. Key 1 is - * incremented on each iteration by the number of bytes stored in - * the skb. The program also stores the number of received bytes - * in the cgroup storage. - * - * - Attaches the new program to a cgroup using BPF_PROG_ATTACH - * - * - Every second, reads map[0] and map[1] to see how many bytes and - * packets were seen on any socket of tasks in the given cgroup. - */ - -#define _GNU_SOURCE - -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -#include <sys/resource.h> -#include <sys/time.h> -#include <unistd.h> -#include <linux/filter.h> - -#include <linux/bpf.h> -#include <bpf/bpf.h> - -#include "bpf_util.h" -#include "bpf_rlimit.h" -#include "cgroup_helpers.h" - -#define FOO "/foo" -#define BAR "/foo/bar/" -#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -#ifdef DEBUG -#define debug(args...) printf(args) -#else -#define debug(args...) 
-#endif - -static int prog_load(int verdict) -{ - int ret; - struct bpf_insn prog[] = { - BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - - if (ret < 0) { - log_err("Loading program"); - printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); - return 0; - } - return ret; -} - -static int test_foo_bar(void) -{ - int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; - - allow_prog = prog_load(1); - if (!allow_prog) - goto err; - - drop_prog = prog_load(0); - if (!drop_prog) - goto err; - - if (setup_cgroup_environment()) - goto err; - - /* Create cgroup /foo, get fd, and join it */ - foo = create_and_get_cgroup(FOO); - if (foo < 0) - goto err; - - if (join_cgroup(FOO)) - goto err; - - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo"); - goto err; - } - - debug("Attached DROP prog. This ping in cgroup /foo should fail...\n"); - assert(system(PING_CMD) != 0); - - /* Create cgroup /foo/bar, get fd, and join it */ - bar = create_and_get_cgroup(BAR); - if (bar < 0) - goto err; - - if (join_cgroup(BAR)) - goto err; - - debug("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - debug("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - debug("Detached PASS from /foo/bar while DROP is attached to /foo.\n" - "This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo"); - goto err; - } - - debug("Attached PASS from /foo/bar and detached DROP from /foo.\n" - "This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - errno = 0; - log_err("Unexpected success in double detach from /foo"); - goto err; - } - - if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching non-overridable prog to /foo"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching non-overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - errno = 0; - log_err("Unexpected success attaching overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, - 
-			     BPF_F_ALLOW_OVERRIDE)) {
-		errno = 0;
-		log_err("Unexpected success attaching overridable prog to /foo");
-		goto err;
-	}
-
-	if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) {
-		log_err("Attaching different non-overridable prog to /foo");
-		goto err;
-	}
-
-	goto out;
-
-err:
-	rc = 1;
-
-out:
-	close(foo);
-	close(bar);
-	cleanup_cgroup_environment();
-	if (!rc)
-		printf("#override:PASS\n");
-	else
-		printf("#override:FAIL\n");
-	return rc;
-}
-
-static int map_fd = -1;
-
-static int prog_load_cnt(int verdict, int val)
-{
-	int cgroup_storage_fd, percpu_cgroup_storage_fd;
-
-	if (map_fd < 0)
-		map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
-	if (map_fd < 0) {
-		printf("failed to create map '%s'\n", strerror(errno));
-		return -1;
-	}
-
-	cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE,
-				sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
-	if (cgroup_storage_fd < 0) {
-		printf("failed to create map '%s'\n", strerror(errno));
-		return -1;
-	}
-
-	percpu_cgroup_storage_fd = bpf_create_map(
-		BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
-		sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
-	if (percpu_cgroup_storage_fd < 0) {
-		printf("failed to create map '%s'\n", strerror(errno));
-		return -1;
-	}
-
-	struct bpf_insn prog[] = {
-		BPF_MOV32_IMM(BPF_REG_0, 0),
-		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
-		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
-		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
-		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
-		BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */
-		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
-
-		BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd),
-		BPF_MOV64_IMM(BPF_REG_2, 0),
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
-		BPF_MOV64_IMM(BPF_REG_1, val),
-		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0),
-
-		BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd),
-		BPF_MOV64_IMM(BPF_REG_2, 0),
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
-		BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
-		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1),
-		BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
-
-		BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
-		BPF_EXIT_INSN(),
-	};
-	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
-	int ret;
-
-	ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
-			       prog, insns_cnt, "GPL", 0,
-			       bpf_log_buf, BPF_LOG_BUF_SIZE);
-
-	if (ret < 0) {
-		log_err("Loading program");
-		printf("Output from verifier:\n%s\n-------\n", bpf_log_buf);
-		return 0;
-	}
-	close(cgroup_storage_fd);
-	return ret;
-}
-
-
-static int test_multiprog(void)
-{
-	__u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id;
-	int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0;
-	int drop_prog, allow_prog[6] = {}, rc = 0;
-	unsigned long long value;
-	int i = 0;
-
-	for (i = 0; i < 6; i++) {
-		allow_prog[i] = prog_load_cnt(1, 1 << i);
-		if (!allow_prog[i])
-			goto err;
-	}
-	drop_prog = prog_load_cnt(0, 1);
-	if (!drop_prog)
-		goto err;
-
-	if (setup_cgroup_environment())
-		goto err;
-
-	cg1 = create_and_get_cgroup("/cg1");
-	if (cg1 < 0)
-		goto err;
-	cg2 = create_and_get_cgroup("/cg1/cg2");
-	if (cg2 < 0)
-		goto err;
-	cg3 = create_and_get_cgroup("/cg1/cg2/cg3");
-	if (cg3 < 0)
-		goto err;
-	cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4");
-	if (cg4 < 0)
-		goto err;
-	cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5");
-	if (cg5 < 0)
-		goto err;
-
-	if (join_cgroup("/cg1/cg2/cg3/cg4/cg5"))
-		goto err;
-
-	if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
-			    BPF_F_ALLOW_MULTI)) {
-		log_err("Attaching prog to cg1");
-		goto err;
-	}
-	if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
-			     BPF_F_ALLOW_MULTI)) {
-		log_err("Unexpected success attaching the same prog to cg1");
-		goto err;
-	}
-	if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS,
-			    BPF_F_ALLOW_MULTI)) {
-		log_err("Attaching prog2 to cg1");
-		goto err;
-	}
-	if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS,
-			    BPF_F_ALLOW_OVERRIDE)) {
-		log_err("Attaching prog to cg2");
-		goto err;
-	}
-	if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS,
-			    BPF_F_ALLOW_MULTI)) {
-		log_err("Attaching prog to cg3");
-		goto err;
-	}
-	if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS,
-			    BPF_F_ALLOW_OVERRIDE)) {
-		log_err("Attaching prog to cg4");
-		goto err;
-	}
-	if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) {
-		log_err("Attaching prog to cg5");
-		goto err;
-	}
-	assert(system(PING_CMD) == 0);
-	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
-	assert(value == 1 + 2 + 8 + 32);
-
-	/* query the number of effective progs in cg5 */
-	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
-			      NULL, NULL, &prog_cnt) == 0);
-	assert(prog_cnt == 4);
-	/* retrieve prog_ids of effective progs in cg5 */
-	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
-			      &attach_flags, prog_ids, &prog_cnt) == 0);
-	assert(prog_cnt == 4);
-	assert(attach_flags == 0);
-	saved_prog_id = prog_ids[0];
-	/* check enospc handling */
-	prog_ids[0] = 0;
-	prog_cnt = 2;
-	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
-			      &attach_flags, prog_ids, &prog_cnt) == -1 &&
-	       errno == ENOSPC);
-	assert(prog_cnt == 4);
-	/* check that prog_ids are returned even when buffer is too small */
-	assert(prog_ids[0] == saved_prog_id);
-	/* retrieve prog_id of single attached prog in cg5 */
-	prog_ids[0] = 0;
-	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
-			      NULL, prog_ids, &prog_cnt) == 0);
-	assert(prog_cnt == 1);
-	assert(prog_ids[0] == saved_prog_id);
-
-	/* detach bottom program and ping again */
-	if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) {
-		log_err("Detaching prog from cg5");
-		goto err;
-	}
-	value = 0;
-	assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
-	assert(system(PING_CMD) == 0);
-	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
-	assert(value == 1 + 2 + 8 + 16);
-
-	/* detach 3rd from bottom program and ping again */
-	errno = 0;
-	if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) {
-		log_err("Unexpected success on detach from cg3");
-		goto err;
-	}
-	if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) {
-		log_err("Detaching from cg3");
-		goto err;
-	}
-	value = 0;
-	assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
-	assert(system(PING_CMD) == 0);
-	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
-	assert(value == 1 + 2 + 16);
-
-	/* detach 2nd from bottom program and ping again */
-	if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) {
-		log_err("Detaching prog from cg4");
-		goto err;
-	}
-	value = 0;
-	assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
-	assert(system(PING_CMD) == 0);
-	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
-	assert(value == 1 + 2 + 4);
-
-	prog_cnt = 4;
-	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
-			      &attach_flags, prog_ids, &prog_cnt) == 0);
-	assert(prog_cnt == 3);
-	assert(attach_flags == 0);
-	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
-			      NULL, prog_ids, &prog_cnt) == 0);
-	assert(prog_cnt == 0);
-	goto out;
-err:
-	rc = 1;
-
-out:
-	for (i = 0; i < 6; i++)
-		if (allow_prog[i] > 0)
-			close(allow_prog[i]);
-	close(cg1);
-	close(cg2);
-	close(cg3);
-	close(cg4);
-	close(cg5);
-	cleanup_cgroup_environment();
-	if (!rc)
-		printf("#multi:PASS\n");
-	else
-		printf("#multi:FAIL\n");
-	return rc;
-}
-
-static int test_autodetach(void)
-{
-	__u32 prog_cnt = 4, attach_flags;
-	int allow_prog[2] = {0};
-	__u32 prog_ids[2] = {0};
-	int cg = 0, i, rc = -1;
-	void *ptr = NULL;
-	int attempts;
-
-	for (i = 0; i < ARRAY_SIZE(allow_prog); i++) {
-		allow_prog[i] = prog_load_cnt(1, 1 << i);
-		if (!allow_prog[i])
-			goto err;
-	}
-
-	if (setup_cgroup_environment())
-		goto err;
-
-	/* create a cgroup, attach two programs and remember their ids */
-	cg = create_and_get_cgroup("/cg_autodetach");
-	if (cg < 0)
-		goto err;
-
-	if (join_cgroup("/cg_autodetach"))
-		goto err;
-
-	for (i = 0; i < ARRAY_SIZE(allow_prog); i++) {
-		if (bpf_prog_attach(allow_prog[i], cg, BPF_CGROUP_INET_EGRESS,
-				    BPF_F_ALLOW_MULTI)) {
-			log_err("Attaching prog[%d] to cg:egress", i);
-			goto err;
-		}
-	}
-
-	/* make sure that programs are attached and run some traffic */
-	assert(bpf_prog_query(cg, BPF_CGROUP_INET_EGRESS, 0, &attach_flags,
-			      prog_ids, &prog_cnt) == 0);
-	assert(system(PING_CMD) == 0);
-
-	/* allocate some memory (4Mb) to pin the original cgroup */
-	ptr = malloc(4 * (1 << 20));
-	if (!ptr)
-		goto err;
-
-	/* close programs and cgroup fd */
-	for (i = 0; i < ARRAY_SIZE(allow_prog); i++) {
-		close(allow_prog[i]);
-		allow_prog[i] = 0;
-	}
-
-	close(cg);
-	cg = 0;
-
-	/* leave the cgroup and remove it. don't detach programs */
-	cleanup_cgroup_environment();
-
-	/* wait for the asynchronous auto-detachment.
-	 * wait for no more than 5 sec and give up.
-	 */
-	for (i = 0; i < ARRAY_SIZE(prog_ids); i++) {
-		for (attempts = 5; attempts >= 0; attempts--) {
-			int fd = bpf_prog_get_fd_by_id(prog_ids[i]);
-
-			if (fd < 0)
-				break;
-
-			/* don't leave the fd open */
-			close(fd);
-
-			if (!attempts)
-				goto err;
-
-			sleep(1);
-		}
-	}
-
-	rc = 0;
-err:
-	for (i = 0; i < ARRAY_SIZE(allow_prog); i++)
-		if (allow_prog[i] > 0)
-			close(allow_prog[i]);
-	if (cg)
-		close(cg);
-	free(ptr);
-	cleanup_cgroup_environment();
-	if (!rc)
-		printf("#autodetach:PASS\n");
-	else
-		printf("#autodetach:FAIL\n");
-	return rc;
-}
-
-int main(void)
-{
-	int (*tests[])(void) = {
-		test_foo_bar,
-		test_multiprog,
-		test_autodetach,
-	};
-	int errors = 0;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(tests); i++)
-		if (tests[i]())
-			errors++;
-
-	if (errors)
-		printf("test_cgroup_attach:FAIL\n");
-	else
-		printf("test_cgroup_attach:PASS\n");
-
-	return errors ? EXIT_FAILURE : EXIT_SUCCESS;
-}
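For context on the attach-flag semantics the removed test exercised: BPF_F_ALLOW_MULTI lets several programs coexist on one cgroup, which is why removal goes through bpf_prog_detach2() and must name the exact program. A minimal sketch, with hypothetical descriptors not taken from this patch:

	/* prog_fd and cg_fd are placeholder descriptors */
	if (bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_EGRESS,
			    BPF_F_ALLOW_MULTI))
		goto err;

	/* several programs may be attached; identify which one to remove */
	if (bpf_prog_detach2(prog_fd, cg_fd, BPF_CGROUP_INET_EGRESS))
		goto err;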
diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp
index f0eb2727b766..6fe23a10d48a 100644
--- a/tools/testing/selftests/bpf/test_cpp.cpp
+++ b/tools/testing/selftests/bpf/test_cpp.cpp
@@ -1,12 +1,16 @@
 /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#include <iostream>
 #include "libbpf.h"
 #include "bpf.h"
 #include "btf.h"
+#include "test_core_extern.skel.h"
 
 /* do nothing, just make sure we can link successfully */
 
 int main(int argc, char *argv[])
 {
+	struct test_core_extern *skel;
+
 	/* libbpf.h */
 	libbpf_set_print(NULL);
 
@@ -16,5 +20,11 @@ int main(int argc, char *argv[])
 	/* btf.h */
 	btf__new(NULL, 0);
 
+	/* BPF skeleton */
+	skel = test_core_extern__open_and_load();
+	test_core_extern__destroy(skel);
+
+	std::cout << "DONE!" << std::endl;
+
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 8477df835979..de1fdaa4e7b4 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -100,6 +100,7 @@ extern struct ipv6_packet pkt_v6;
 
 #define _CHECK(condition, tag, duration, format...) ({			\
 	int __ret = !!(condition);					\
+	int __save_errno = errno;					\
 	if (__ret) {							\
 		test__fail();						\
 		printf("%s:FAIL:%s ", __func__, tag);			\
@@ -108,15 +109,18 @@ extern struct ipv6_packet pkt_v6;
 		printf("%s:PASS:%s %d nsec\n",				\
 		       __func__, tag, duration);			\
 	}								\
+	errno = __save_errno;						\
 	__ret;								\
 })
 
 #define CHECK_FAIL(condition) ({					\
 	int __ret = !!(condition);					\
+	int __save_errno = errno;					\
 	if (__ret) {							\
 		test__fail();						\
 		printf("%s:FAIL:%d\n", __func__, __LINE__);		\
 	}								\
+	errno = __save_errno;						\
 	__ret;								\
 })
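The test_progs.h hunk above saves and restores errno inside _CHECK() and CHECK_FAIL() because both the pass and fail branches call printf(), which may clobber errno between a syscall and a later errno assertion. An illustrative caller — names are hypothetical, and it assumes test_progs' CHECK() with its local "duration":

	err = bpf_map_lookup_elem(map_fd, &key, &val);
	if (CHECK(!err, "lookup", "unexpected success\n"))
		return;
	/* without the save/restore, the printf() inside the previous
	 * CHECK() could have overwritten errno before this test runs */
	if (CHECK(errno != ENOENT, "lookup_errno", "errno %d\n", errno))
		return;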