17 files changed, 928 insertions, 45 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 99f76103c6b7..2ca777635d8e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -12,6 +12,7 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <asm/cacheflush.h>
+#include <linux/bpf.h>
 
 int bpf_jit_enable __read_mostly;
 
@@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 	return ptr + len;
 }
 
-#define EMIT(bytes, len)	do { prog = emit_code(prog, bytes, len); } while (0)
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
 
 #define EMIT1(b1)		EMIT(b1, 1)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
@@ -186,31 +188,31 @@ struct jit_context {
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
-		  int oldproglen, struct jit_context *ctx)
+#define STACKSIZE \
+	(MAX_BPF_STACK + \
+	 32 /* space for rbx, r13, r14, r15 */ + \
+	 8 /* space for skb_copy_bits() buffer */)
+
+#define PROLOGUE_SIZE 51
+
+/* emit x64 prologue code for BPF program and check its size.
+ * bpf_tail_call helper will skip it while jumping into another program
+ */
+static void emit_prologue(u8 **pprog)
 {
-	struct bpf_insn *insn = bpf_prog->insnsi;
-	int insn_cnt = bpf_prog->len;
-	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
-	bool seen_exit = false;
-	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
-	int i;
-	int proglen = 0;
-	u8 *prog = temp;
-	int stacksize = MAX_BPF_STACK +
-		32 /* space for rbx, r13, r14, r15 */ +
-		8 /* space for skb_copy_bits() buffer */;
+	u8 *prog = *pprog;
+	int cnt = 0;
 
 	EMIT1(0x55); /* push rbp */
 	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
 
-	/* sub rsp, stacksize */
-	EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+	/* sub rsp, STACKSIZE */
+	EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);
 
 	/* all classic BPF filters use R6(rbx) save it */
 
 	/* mov qword ptr [rbp-X],rbx */
-	EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+	EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);
 
 	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
 	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	 */
 
 	/* mov qword ptr [rbp-X],r13 */
-	EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+	EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
 	/* mov qword ptr [rbp-X],r14 */
-	EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+	EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
 	/* mov qword ptr [rbp-X],r15 */
-	EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+	EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
 
 	/* clear A and X registers */
 	EMIT2(0x31, 0xc0); /* xor eax, eax */
 	EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
 
+	/* clear tail_cnt: mov qword ptr [rbp-X], rax */
+	EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
+
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* emit x64 code for bpf_tail_call(ctx, array, index) and advance *pprog:
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   tail_call_cnt++;
+ *   prog = array->prog[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int label1, label2, label3;
+	int cnt = 0;
+
+	/* rdi - pointer to ctx
+	 * rsi - pointer to bpf_array
+	 * rdx - index in bpf_array
+	 */
+
+	/* if (index >= array->map.max_entries) goto out;
+	 * max_entries is u32: load 4 bytes (zero-extended into rax), not 8
+	 */
+	EMIT3(0x8B, 0x46,                         /* mov eax, dword ptr [rsi + 16] */
+	      offsetof(struct bpf_array, map.max_entries));
+	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
+#define OFFSET1 44 /* number of bytes to jump */
+	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
+	label1 = cnt;
+
+	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *   goto out;
+	 */
+	EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
+	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
+#define OFFSET2 33
+	EMIT2(X86_JA, OFFSET2);                   /* ja out */
+	label2 = cnt;
+	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
+	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
+
+	/* prog = array->prog[index]; */
+	EMIT4(0x48, 0x8D, 0x44, 0xD6);            /* lea rax, [rsi + rdx * 8 + 0x50] */
+	EMIT1(offsetof(struct bpf_array, prog));
+	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */
+
+	/* if (prog == NULL)
+	 *   goto out;
+	 */
+	EMIT4(0x48, 0x83, 0xF8, 0x00);            /* cmp rax, 0 */
+#define OFFSET3 10
+	EMIT2(X86_JE, OFFSET3);                   /* je out */
+	label3 = cnt;
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	EMIT4(0x48, 0x8B, 0x40,                   /* mov rax, qword ptr [rax + 32] */
+	      offsetof(struct bpf_prog, bpf_func));
+	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
+
+	/* now we're ready to jump into next BPF program
+	 * rdi == ctx (1st arg)
+	 * rax == prog->bpf_func + prologue_size
+	 */
+	EMIT2(0xFF, 0xE0);                        /* jmp rax */
+
+	/* out: the hard-coded short-jump distances above must land here */
+	BUILD_BUG_ON(cnt - label1 != OFFSET1);
+	BUILD_BUG_ON(cnt - label2 != OFFSET2);
+	BUILD_BUG_ON(cnt - label3 != OFFSET3);
+	*pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog);
+
 	if (seen_ld_abs) {
 		/* r9d : skb->len - skb->data_len (headlen)
 		 * r10 : skb->data
@@ -739,6 +837,10 @@ xadd:			if (is_imm8(insn->off))
 			}
 			break;
 
+		case BPF_JMP | BPF_CALL | BPF_X:
+			emit_bpf_tail_call(&prog);
+			break;
+
 			/* cond jump */
 		case BPF_JMP | BPF_JEQ | BPF_X:
 		case BPF_JMP | BPF_JNE | BPF_X:
@@ -891,13 +993,13 @@ common_load:
 			/* update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp-X] */
-			EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+			EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
 			/* mov r13, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+			EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
 			/* mov r14, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+			EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
 			/* mov r15, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+			EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);
 
 			EMIT1(0xC9); /* leave */
 			EMIT1(0xC3); /* ret */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d5cda067115a..8821b9a8689e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -126,6 +126,27 @@ struct bpf_prog_aux {
 	struct work_struct work;
 };
 
+struct bpf_array {
+	struct bpf_map map;
+	u32 elem_size;
+	/* 'ownership' of prog_array is claimed by the first program that
+	 * is going to use this map or by the first program whose FD is stored
+	 * in the map to make sure that all callers and callees have the same
+	 * prog_type and JITed flag
+	 */
+	enum bpf_prog_type owner_prog_type;
+	bool owner_jited;
+	union {
+		char value[0] __aligned(8);	/* raw value storage (plain array maps, presumably) */
+		struct bpf_prog *prog[0] __aligned(8);	/* program slots used by the prog_array map ops */
+	};
+};
+#define MAX_TAIL_CALL_CNT 32
+
+u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+void bpf_prog_array_map_clear(struct bpf_map *map);
+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+
 #ifdef CONFIG_BPF_SYSCALL
 void bpf_register_prog_type(struct bpf_prog_type_list *tl);
 void bpf_register_map_type(struct bpf_map_type_list *tl);
@@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
+extern const struct bpf_func_proto bpf_tail_call_proto;
 
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 200be4a74a33..17724f6ea983 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 
 int sk_filter(struct sock *sk, struct sk_buff *skb);
 
-void bpf_prog_select_runtime(struct bpf_prog *fp);
+int bpf_prog_select_runtime(struct bpf_prog *fp);
 void bpf_prog_free(struct bpf_prog *fp);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9ebdf5701e8..f0a9af8b4dae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -113,6 +113,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
+	BPF_MAP_TYPE_PROG_ARRAY,
 };
 
 enum bpf_prog_type {
@@ -210,6 +211,15 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_l4_csum_replace,
+
+	/**
+	 * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
+	 * @ctx: context pointer passed to next program
+	 * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
+	 * @index: index inside array that selects specific program to run
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_tail_call,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a6616583f38..614bcd4c1d74 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-
-struct bpf_array {
-	struct bpf_map map;
-	u32 elem_size;
-	char value[0] __aligned(8);
-};
+#include <linux/filter.h>
 
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
 	return 0;
 }
 late_initcall(register_array_map);
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	/* only bpf_prog file descriptors can be stored in prog_array map */
+	if (attr->value_size != sizeof(u32))
+		return ERR_PTR(-EINVAL);
+	return array_map_alloc(attr);
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* make sure it's empty */
+	for (i = 0; i < array->map.max_entries; i++)
+		BUG_ON(array->prog[i] != NULL);
+	kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* only called from syscall: stores the prog behind fd *value in slot *key */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+				      void *value, u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog, *old_prog;
+	u32 index = *(u32 *)key, ufd;
+
+	if (map_flags != BPF_ANY)
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+	/* value carries a bpf_prog file descriptor; resolve it to the program */
+	ufd = *(u32 *)value;
+	prog = bpf_prog_get(ufd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+	/* type/JITed mismatch with the array owner: drop prog and fail */
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+	/* publish the new prog and release the one it replaced, if any */
+	old_prog = xchg(array->prog + index, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *old_prog;
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	old_prog = xchg(array->prog + index, NULL);
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		prog_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = prog_array_map_lookup_elem,
+	.map_update_elem = prog_array_map_update_elem,
+	.map_delete_elem = prog_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+	.ops = &prog_array_ops,
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+	bpf_register_map_type(&prog_array_type);
+	return 0;
+}
+late_initcall(register_prog_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7fcd0e2..d44b25cbe460 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return 0;
 }
 
+const struct bpf_func_proto bpf_tail_call_proto = {
+	.func = NULL,
+	.gpl_only = false,
+	.ret_type = RET_VOID,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
 /**
  *	__bpf_prog_run - run eBPF program on a given context
  *	@ctx: is the data we are operating on
@@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
 		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
 
@@ -431,6 +442,30 @@ select_insn:
 						       BPF_R4, BPF_R5);
 		CONT;
 
+	JMP_TAIL_CALL: {
+		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+		struct bpf_prog *prog;
+		u64 index = BPF_R3;
+
+		if (unlikely(index >= array->map.max_entries))
+			goto out;
+
+		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+			goto out;
+
+		tail_call_cnt++;
+
+		prog = READ_ONCE(array->prog[index]);
+		if (unlikely(!prog))
+			goto out;
+
+		ARG1 = BPF_R1;
+		insn = prog->insnsi;
+		goto select_insn;
+out:
+		CONT;
+	}
 	/* JMP */
 	JMP_JA:
 		insn += insn->off;
@@ -619,6 +654,40 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 }
 
+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp)
+{
+	if (array->owner_prog_type) {	/* already owned: fp must match owner */
+		if (array->owner_prog_type != fp->type)
+			return false;
+		if (array->owner_jited != fp->jited)
+			return false;
+	} else {			/* unowned: fp claims ownership */
+		array->owner_prog_type = fp->type;
+		array->owner_jited = fp->jited;
+	}
+	return true;
+}
+
+static int check_tail_call(const struct bpf_prog *fp)
+{
+	struct bpf_prog_aux *aux = fp->aux;
+	int i;
+
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		struct bpf_array *array;
+		struct bpf_map *map;
+
+		map = aux->used_maps[i];
+		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+			continue;
+		array = container_of(map, struct bpf_array, map);
+		if (!bpf_prog_array_compatible(array, fp))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  *	bpf_prog_select_runtime - select execution runtime for BPF program
  *	@fp: bpf_prog populated with internal BPF program
@@ -626,7 +695,7 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
  * try to JIT internal BPF program, if JIT is not available select interpreter
  * BPF program will be executed via BPF_PROG_RUN() macro
  */
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
@@ -634,6 +703,8 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
 	bpf_int_jit_compile(fp);
 	/* Lock whole bpf_prog as read-only */
 	bpf_prog_lock_ro(fp);
+
+	return check_tail_call(fp);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c591914..98a69bd83069 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;
 
+	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+		/* prog_array stores refcnt-ed bpf_prog pointers
+		 * release them all when user space closes prog_array_fd
+		 */
+		bpf_prog_array_map_clear(map);
+
 	bpf_map_put(map);
 	return 0;
 }
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);
 
+			if (insn->imm == BPF_FUNC_tail_call) {
+				/* mark bpf_tail_call as different opcode
+				 * to avoid conditional branch in
+				 * interpreter for every normal call
+				 * and to prevent accidental JITing by
+				 * JIT compiler that doesn't support
+				 * bpf_tail_call yet
+				 */
+				insn->imm = 0;
+				insn->code |= BPF_X;
+				continue;
+			}
+
 			fn = prog->aux->ops->get_func_proto(insn->imm);
 			/* all functions that have prototype and verifier allowed
 			 * programs to call them, must be real in-kernel functions
@@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	bpf_prog_select_runtime(prog);
+	err = bpf_prog_select_runtime(prog);
+	if (err < 0)
+		goto free_used_maps;
 
 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3aa6e23..cfd9a40b9a5a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
+
+	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+	    func_id != BPF_FUNC_tail_call)
+		/* prog_array map type needs extra care:
+		 * only allow to pass it into bpf_tail_call() for now.
+		 * bpf_map_delete_elem() can be allowed in the future,
+		 * while bpf_map_update_elem() must only be done via syscall
+		 */
+		return -EINVAL;
+
+	if (func_id == BPF_FUNC_tail_call &&
+	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+		/* don't allow any other map type to be passed into
+		 * bpf_tail_call()
+		 */
+		return -EINVAL;
+
 	return 0;
 }
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce501632..646445e41bd4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_probe_read_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
 
 	case BPF_FUNC_trace_printk:
 		/*
diff --git a/net/core/filter.c b/net/core/filter.c
index 6805717be614..3adcca6f17a4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1421,6 +1421,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
 	default:
 		return NULL;
 	}
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8fdbd73429dd..46c6a8cf74d3 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,29 +6,35 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += sockex3
 hostprogs-y += tracex1
 hostprogs-y += tracex2
 hostprogs-y += tracex3
 hostprogs-y += tracex4
+hostprogs-y += tracex5
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+sockex3-objs := bpf_load.o libbpf.o sockex3_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
+tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += sockex3_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
 always += tracex3_kern.o
 always += tracex4_kern.o
+always += tracex5_kern.o
 always += tcbpf1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
@@ -36,10 +42,12 @@ HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_sockex3 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
+HOSTLOADLIBES_tracex5 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f960b5fb3ed8..f531a0b3282d 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -21,6 +21,10 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
 	(void *) BPF_FUNC_ktime_get_ns;
 static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
 	(void *) BPF_FUNC_trace_printk;
+static void (*bpf_tail_call)(void *ctx, void *map, int index) =
+	(void *) BPF_FUNC_tail_call;
+static unsigned long long (*bpf_get_smp_processor_id)(void) =
+	(void *) BPF_FUNC_get_smp_processor_id;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 38dac5a53b51..da86a8e0a95a 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -16,6 +16,7 @@
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <poll.h>
+#include <ctype.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
@@ -29,6 +30,19 @@ int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
 int event_fd[MAX_PROGS];
 int prog_cnt;
+int prog_array_fd = -1;
+
+static int populate_prog_array(const char *event, int prog_fd)
+{
+	int ind = atoi(event), err;
+
+	err = bpf_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
+	if (err < 0) {
+		printf("failed to store prog_fd in prog_array\n");
+		return -1;
+	}
+	return 0;
+}
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
@@ -54,12 +68,40 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		return -1;
 	}
 
+	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
+	if (fd < 0) {
+		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
+		return -1;
+	}
+
+	prog_fd[prog_cnt++] = fd;
+
+	if (is_socket) {
+		event += 6;
+		if (*event != '/')
+			return 0;
+		event++;
+		if (!isdigit(*event)) {
+			printf("invalid prog number\n");
+			return -1;
+		}
+		return populate_prog_array(event, fd);
+	}
+
 	if (is_kprobe || is_kretprobe) {
 		if (is_kprobe)
 			event += 7;
 		else
 			event += 10;
 
+		if (*event == 0) {
+			printf("event name cannot be empty\n");
+			return -1;
+		}
+
+		if (isdigit(*event))
+			return populate_prog_array(event, fd);
+
 		snprintf(buf, sizeof(buf),
 			 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
 			 is_kprobe ? 'p' : 'r', event, event);
@@ -71,18 +113,6 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		}
 	}
 
-	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
-
-	if (fd < 0) {
-		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
-		return -1;
-	}
-
-	prog_fd[prog_cnt++] = fd;
-
-	if (is_socket)
-		return 0;
-
 	strcpy(buf, DEBUGFS);
 	strcat(buf, "events/kprobes/");
 	strcat(buf, event);
@@ -130,6 +160,9 @@ static int load_maps(struct bpf_map_def *maps, int len)
 					   maps[i].max_entries);
 		if (map_fd[i] < 0)
 			return 1;
+
+		if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
+			prog_array_fd = map_fd[i];
 	}
 	return 0;
 }
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
new file mode 100644
index 000000000000..2625b987944f
--- /dev/null
+++ b/samples/bpf/sockex3_kern.c
@@ -0,0 +1,303 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/in.h>
+#include <uapi/linux/if.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/if_tunnel.h>
+#include <uapi/linux/mpls.h>
+#define IP_MF		0x2000
+#define IP_OFFSET	0x1FFF
+
+#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
+
+struct bpf_map_def SEC("maps") jmp_table = {
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u32),
+	.max_entries = 8,
+};
+
+#define PARSE_VLAN 1
+#define PARSE_MPLS 2
+#define PARSE_IP 3
+#define PARSE_IPV6 4
+
+/* protocol dispatch routine.
+ * It tail-calls next BPF program depending on eth proto
+ * Note, we could have used:
+ * bpf_tail_call(skb, &jmp_table, proto);
+ * but it would need large prog_array
+ */
+static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
+{
+	switch (proto) {
+	case ETH_P_8021Q:
+	case ETH_P_8021AD:
+		bpf_tail_call(skb, &jmp_table, PARSE_VLAN);
+		break;
+	case ETH_P_MPLS_UC:
+	case ETH_P_MPLS_MC:
+		bpf_tail_call(skb, &jmp_table, PARSE_MPLS);
+		break;
+	case ETH_P_IP:
+		bpf_tail_call(skb, &jmp_table, PARSE_IP);
+		break;
+	case ETH_P_IPV6:
+		bpf_tail_call(skb, &jmp_table, PARSE_IPV6);
+		break;
+	}
+}
+
+struct vlan_hdr {
+	__be16 h_vlan_TCI;
+	__be16 h_vlan_encapsulated_proto;
+};
+
+struct flow_keys {
+	__be32 src;
+	__be32 dst;
+	union {
+		__be32 ports;
+		__be16 port16[2];
+	};
+	__u32 ip_proto;
+};
+
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+	return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+		& (IP_MF | IP_OFFSET);
+}
+
+static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
+{
+	__u64 w0 = load_word(ctx, off);
+	__u64 w1 = load_word(ctx, off + 4);
+	__u64 w2 = load_word(ctx, off + 8);
+	__u64 w3 = load_word(ctx, off + 12);
+
+	return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+struct globals {
+	struct flow_keys flow;
+	__u32 nhoff;
+};
+
+struct bpf_map_def SEC("maps") percpu_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct globals),
+	.max_entries = 32,
+};
+
+/* use poor man's per_cpu until native support is ready */
+static struct globals *this_cpu_globals(void)
+{
+	u32 key = bpf_get_smp_processor_id();
+
+	return bpf_map_lookup_elem(&percpu_map, &key);
+}
+
+/* some simple stats for user space consumption */
+struct pair {
+	__u64 packets;
+	__u64 bytes;
+};
+
+struct bpf_map_def SEC("maps") hash_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(struct flow_keys),
+	.value_size = sizeof(struct pair),
+	.max_entries = 1024,
+};
+
+static void update_stats(struct __sk_buff *skb, struct globals *g)
+{
+	struct flow_keys key = g->flow;
+	struct pair *value;
+
+	value = bpf_map_lookup_elem(&hash_map, &key);
+	if (value) {
+		__sync_fetch_and_add(&value->packets, 1);
+		__sync_fetch_and_add(&value->bytes, skb->len);
+	} else {
+		struct pair val = {1, skb->len};
+
+		bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
+	}
+}
+
+static __always_inline void parse_ip_proto(struct __sk_buff *skb,
+					   struct globals *g, __u32 ip_proto)
+{
+	__u32 nhoff = g->nhoff;
+	/* dispatch on the IP protocol; tunnels re-enter via parse_eth_proto() */
+	switch (ip_proto) {
+	case IPPROTO_GRE: {
+		struct gre_hdr {
+			__be16 flags;
+			__be16 proto;
+		};
+
+		__u32 gre_flags = load_half(skb,
+					    nhoff + offsetof(struct gre_hdr, flags));
+		__u32 gre_proto = load_half(skb,
+					    nhoff + offsetof(struct gre_hdr, proto));
+
+		if (gre_flags & (GRE_VERSION|GRE_ROUTING))
+			break;
+
+		nhoff += 4;
+		if (gre_flags & GRE_CSUM)
+			nhoff += 4;
+		if (gre_flags & GRE_KEY)
+			nhoff += 4;
+		if (gre_flags & GRE_SEQ)
+			nhoff += 4;
+
+		g->nhoff = nhoff;
+		parse_eth_proto(skb, gre_proto);
+		break;
+	}
+	case IPPROTO_IPIP:
+		parse_eth_proto(skb, ETH_P_IP);
+		break;
+	case IPPROTO_IPV6:
+		parse_eth_proto(skb, ETH_P_IPV6);
+		break;
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		g->flow.ports = load_word(skb, nhoff);
+		/* fall through */
+	case IPPROTO_ICMP:
+		g->flow.ip_proto = ip_proto;
+		update_stats(skb, g);
+		break;
+	default:
+		break;
+	}
+}
+
+PROG(PARSE_IP)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, verlen, ip_proto;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	if (unlikely(ip_is_fragment(skb, nhoff)))
+		return 0;
+
+	ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
+
+	if (ip_proto != IPPROTO_GRE) {
+		g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
+		g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
+	}
+
+	verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
+	nhoff += (verlen & 0xF) << 2;
+
+	g->nhoff = nhoff;
+	parse_ip_proto(skb, g, ip_proto);
+	return 0;
+}
+
+PROG(PARSE_IPV6)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, ip_proto;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	ip_proto = load_byte(skb,
+			     nhoff + offsetof(struct ipv6hdr, nexthdr));
+	g->flow.src = ipv6_addr_hash(skb,
+				     nhoff + offsetof(struct ipv6hdr, saddr));
+	g->flow.dst = ipv6_addr_hash(skb,
+				     nhoff + offsetof(struct ipv6hdr, daddr));
+	nhoff += sizeof(struct ipv6hdr);
+
+	g->nhoff = nhoff;
+	parse_ip_proto(skb, g, ip_proto);
+	return 0;
+}
+
+PROG(PARSE_VLAN)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, proto;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+						h_vlan_encapsulated_proto));
+	nhoff += sizeof(struct vlan_hdr);
+	g->nhoff = nhoff;
+
+	parse_eth_proto(skb, proto);
+
+	return 0;
+}
+
+PROG(PARSE_MPLS)(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff, label;
+
+	if (!g)
+		return 0;
+
+	nhoff = g->nhoff;
+
+	label = load_word(skb, nhoff);
+	nhoff += sizeof(struct mpls_label);
+	g->nhoff = nhoff;
+	/* S bit set: pick IPv4 vs IPv6 from the version nibble (high 4 bits) */
+	if (label & MPLS_LS_S_MASK) {
+		__u8 verlen = load_byte(skb, nhoff);
+		if ((verlen & 0xF0) == 0x40)
+			parse_eth_proto(skb, ETH_P_IP);
+		else
+			parse_eth_proto(skb, ETH_P_IPV6);
+	} else {
+		parse_eth_proto(skb, ETH_P_MPLS_UC);
+	}
+
+	return 0;
+}
+
+SEC("socket/0")
+int main_prog(struct __sk_buff *skb)
+{
+	struct globals *g = this_cpu_globals();
+	__u32 nhoff = ETH_HLEN;
+	__u32 proto = load_half(skb, 12);
+
+	if (!g)
+		return 0;
+
+	g->nhoff = nhoff;
+	parse_eth_proto(skb, proto);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
new file mode 100644
index 000000000000..2617772d060d
--- /dev/null
+++ b/samples/bpf/sockex3_user.c
@@ -0,0 +1,66 @@
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+struct flow_keys {	/* key type used with map_fd[2] in main() */
+	__be32 src;	/* main() passes htonl(src) to inet_ntoa() */
+	__be32 dst;	/* main() passes htonl(dst) to inet_ntoa() */
+	union {
+		__be32 ports;
+		__be16 port16[2];	/* [0] = src port, [1] = dst */
+	};
+	__u32 ip_proto;
+};	/* NOTE(review): presumably mirrors the kernel-side key - confirm */
+
+struct pair {	/* value type main() reads from map_fd[2] */
+	__u64 packets;	/* printed in the "packets" column */
+	__u64 bytes;	/* printed in the "bytes" column */
+};
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+	FILE *f;
+	int i, sock;
+
+	/* load <argv[0]>_kern.o, attach prog_fd[4] to sock, print map_fd[2] */
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	sock = open_raw_sock("lo");
+	assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4],
+			  sizeof(__u32)) == 0);
+
+	if (argc > 1)
+		f = popen("ping -c5 localhost", "r");
+	else
+		f = popen("netperf -l 4 localhost", "r");
+	(void) f;
+
+	for (i = 0; i < 5; i++) {
+		struct flow_keys key = {}, next_key;
+		struct pair value;
+
+		sleep(1);
+		printf("IP     src.port -> dst.port               bytes      packets\n");
+		while (bpf_get_next_key(map_fd[2], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[2], &next_key, &value);
+			/* inet_ntoa() buffer is static: one call per printf */
+			printf("%s.%05d -> ",
+			       inet_ntoa((struct in_addr){htonl(next_key.src)}),
+			       next_key.port16[0]);
+			printf("%s.%05d %12llu %12llu\n",
+			       inet_ntoa((struct in_addr){htonl(next_key.dst)}),
+			       next_key.port16[1], value.bytes, value.packets);
+			key = next_key;
+		}
+	}
+	return 0;
+}
diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
new file mode 100644
index 000000000000..b71fe07a7a7a
--- /dev/null
+++ b/samples/bpf/tracex5_kern.c
@@ -0,0 +1,75 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/seccomp.h>
+#include "bpf_helpers.h"
+
+#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
+/* prog array that bpf_prog1() tail-calls into, indexed by syscall nr */
+struct bpf_map_def SEC("maps") progs = {
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u32),
+	.max_entries = 1024,
+};
+
+SEC("kprobe/seccomp_phase1")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	struct seccomp_data sd = {};	/* seccomp_data read via ctx->di */
+
+	bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di);
+
+	/* dispatch: jump into progs[sd.nr], selected by syscall number */
+	bpf_tail_call(ctx, &progs, sd.nr);
+
+	/* still here: the tail call did not jump -> no program for this nr */
+	if (sd.nr >= __NR_getuid && sd.nr <= __NR_getsid) {
+		char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n";
+		bpf_trace_printk(fmt, sizeof(fmt), sd.nr);
+	}
+	return 0;
+}
+
+/* tail-call target for __NR_write: trace only writes with args[2] == 512 */
+PROG(__NR_write)(struct pt_regs *ctx)
+{
+	struct seccomp_data sd = {};	/* seccomp_data read via ctx->di */
+
+	bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di);
+	if (sd.args[2] == 512) {
+		char fmt[] = "write(fd=%d, buf=%p, size=%d)\n";
+		bpf_trace_printk(fmt, sizeof(fmt),
+				 sd.args[0], sd.args[1], sd.args[2]);
+	}
+	return 0;
+}
+
+PROG(__NR_read)(struct pt_regs *ctx)
+{
+	struct seccomp_data sd = {};	/* seccomp_data read via ctx->di */
+
+	bpf_probe_read(&sd, sizeof(sd), (void *)ctx->di);
+	if (sd.args[2] > 128 && sd.args[2] <= 1024) {	/* 129..1024 only */
+		char fmt[] = "read(fd=%d, buf=%p, size=%d)\n";
+		bpf_trace_printk(fmt, sizeof(fmt),
+				 sd.args[0], sd.args[1], sd.args[2]);
+	}
+	return 0;
+}
+
+PROG(__NR_mmap)(struct pt_regs *ctx)
+{
+	char fmt[] = "mmap\n";
+	bpf_trace_printk(fmt, sizeof(fmt));	/* logs unconditionally */
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";	/* for the BPF loader (assumed) */
+u32 _version SEC("version") = LINUX_VERSION_CODE;	/* build-time value */
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
new file mode 100644
index 000000000000..a04dd3cd4358
--- /dev/null
+++ b/samples/bpf/tracex5_user.c
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+/* Load a one-instruction seccomp filter that allows every syscall. Its only
+ * job is to enable the seccomp path so the seccomp_phase1() kprobe can fire.
+ */
+static void install_accept_all_seccomp(void)
+{
+	struct sock_filter insns[] = {
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog fprog = {
+		.filter = insns,
+		.len = (unsigned short)(sizeof(insns) / sizeof(*insns)),
+	};
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog) != 0)
+		perror("prctl");
+}
+
+int main(int ac, char **argv)
+{
+	char obj_path[256];
+	FILE *dd_out;
+
+	/* the kernel-side object is expected at "<argv[0]>_kern.o" */
+	snprintf(obj_path, sizeof(obj_path), "%s_kern.o", argv[0]);
+	if (load_bpf_file(obj_path) != 0) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	install_accept_all_seccomp();
+
+	/* run dd to produce syscalls; its output stream is never consumed */
+	dd_out = popen("dd if=/dev/zero of=/dev/null count=5", "r");
+	(void) dd_out;
+	read_trace_pipe();
+
+	return 0;
+}