18 files changed, 2121 insertions, 202 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index adeaa1302f34..64335bb94f9f 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -40,6 +41,7 @@ hostprogs-y += xdp_redirect
 hostprogs-y += xdp_redirect_map
 hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
+hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 
 # Libbpf dependencies
@@ -58,6 +60,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -88,6 +91,7 @@ xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
 xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
 xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
+xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 
 # Tell kbuild to always build the programs
@@ -101,6 +105,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -136,6 +141,8 @@ always += xdp_redirect_kern.o
 always += xdp_redirect_map_kern.o
 always += xdp_redirect_cpu_kern.o
 always += xdp_monitor_kern.o
+always += xdp_rxq_info_kern.o
+always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
@@ -155,6 +162,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
@@ -178,6 +186,7 @@ HOSTLOADLIBES_xdp_redirect += -lelf
 HOSTLOADLIBES_xdp_redirect_map += -lelf
 HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
+HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
@@ -192,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH)
 endif
 
 # Trick to allow make to be run from this directory
-all:
+all: $(LIBBPF)
 	$(MAKE) -C ../../ $(CURDIR)/
 
 clean:
 	$(MAKE) -C ../../ M=$(CURDIR) clean
 	@rm -f *~
 
+$(LIBBPF): FORCE
+	$(MAKE) -C $(dir $@) $(notdir $@)
+
 $(obj)/syscall_nrs.s:	$(src)/syscall_nrs.c
 	$(call if_changed_dep,cc_s_c)
 
diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
index 370b749f5ee6..f6bbf8f50da3 100644
--- a/samples/bpf/tcbpf2_kern.c
+++ b/samples/bpf/tcbpf2_kern.c
@@ -35,12 +35,22 @@ struct geneve_opt {
 	u8	opt_data[8]; /* hard-coded to 8 byte */
 };
 
+struct erspan_md2 {
+	__be32 timestamp;
+	__be16 sgt;
+	__be16 flags;
+};
+
 struct vxlan_metadata {
 	u32     gbp;
 };
 
 struct erspan_metadata {
-	__be32 index;
+	union {
+		__be32 index;
+		struct erspan_md2 md2;
+	} u;
+	int version;
 };
 
 SEC("gre_set_tunnel")
@@ -81,6 +91,49 @@ int _gre_get_tunnel(struct __sk_buff *skb)
 	return TC_ACT_OK;
 }
 
+SEC("ip6gretap_set_tunnel")
+int _ip6gretap_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key;
+	int ret;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv6[3] = _htonl(0x11); /* ::11 */
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+	key.tunnel_label = 0xabcde;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip6gretap_get_tunnel")
+int _ip6gretap_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "key %d remote ip6 ::%x label %x\n";
+	struct bpf_tunnel_key key;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			 key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
+
+	return TC_ACT_OK;
+}
+
 SEC("erspan_set_tunnel")
 int _erspan_set_tunnel(struct __sk_buff *skb)
 {
@@ -100,7 +153,18 @@ int _erspan_set_tunnel(struct __sk_buff *skb)
 		return TC_ACT_SHOT;
 	}
 
-	md.index = htonl(123);
+	__builtin_memset(&md, 0, sizeof(md));
+#ifdef ERSPAN_V1
+	md.version = 1;
+	md.u.index = htonl(123);
+#else
+	u8 direction = 1;
+	u16 hwid = 7;
+
+	md.version = 2;
+	md.u.md2.flags = htons((direction << 3) | (hwid << 4));
+#endif
+
 	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
 	if (ret < 0) {
 		ERROR(ret);
@@ -113,7 +177,7 @@ int _erspan_set_tunnel(struct __sk_buff *skb)
 SEC("erspan_get_tunnel")
 int _erspan_get_tunnel(struct __sk_buff *skb)
 {
-	char fmt[] = "key %d remote ip 0x%x erspan index 0x%x\n";
+	char fmt[] = "key %d remote ip 0x%x erspan version %d\n";
 	struct bpf_tunnel_key key;
 	struct erspan_metadata md;
 	u32 index;
@@ -131,9 +195,105 @@ int _erspan_get_tunnel(struct __sk_buff *skb)
 		return TC_ACT_SHOT;
 	}
 
-	index = bpf_ntohl(md.index);
 	bpf_trace_printk(fmt, sizeof(fmt),
-			key.tunnel_id, key.remote_ipv4, index);
+			key.tunnel_id, key.remote_ipv4, md.version);
+
+#ifdef ERSPAN_V1
+	char fmt2[] = "\tindex %x\n";
+
+	index = bpf_ntohl(md.u.index);
+	bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+	bpf_trace_printk(fmt2, sizeof(fmt2),
+		(ntohs(md.u.md2.flags) >> 3) & 0x1,
+		(ntohs(md.u.md2.flags) >> 4) & 0x3f,
+		bpf_ntohl(md.u.md2.timestamp));
+#endif
+
+	return TC_ACT_OK;
+}
+
+SEC("ip4ip6erspan_set_tunnel")
+int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	int ret;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv6[3] = _htonl(0x11);
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	__builtin_memset(&md, 0, sizeof(md));
+
+#ifdef ERSPAN_V1
+	md.u.index = htonl(123);
+	md.version = 1;
+#else
+	u8 direction = 0;
+	u16 hwid = 17;
+
+	md.version = 2;
+	md.u.md2.flags = htons((direction << 3) | (hwid << 4));
+#endif
+
+	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip4ip6erspan_get_tunnel")
+int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n";
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	u32 index;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv4, md.version);
+
+#ifdef ERSPAN_V1
+	char fmt2[] = "\tindex %x\n";
+
+	index = bpf_ntohl(md.u.index);
+	bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+	bpf_trace_printk(fmt2, sizeof(fmt2),
+		(ntohs(md.u.md2.flags) >> 3) & 0x1,
+		(ntohs(md.u.md2.flags) >> 4) & 0x3f,
+		bpf_ntohl(md.u.md2.timestamp));
+#endif
 
 	return TC_ACT_OK;
 }
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 3e8232cc04a8..1af412ec6007 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -78,7 +78,8 @@ static int test_foo_bar(void)
 	if (join_cgroup(FOO))
 		goto err;
 
-	if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_OVERRIDE)) {
 		log_err("Attaching prog to /foo");
 		goto err;
 	}
@@ -97,7 +98,8 @@ static int test_foo_bar(void)
 	printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n");
 	assert(system(PING_CMD) != 0);
 
-	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_OVERRIDE)) {
 		log_err("Attaching prog to /foo/bar");
 		goto err;
 	}
@@ -114,7 +116,8 @@ static int test_foo_bar(void)
 	       "This ping in cgroup /foo/bar should fail...\n");
 	assert(system(PING_CMD) != 0);
 
-	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_OVERRIDE)) {
 		log_err("Attaching prog to /foo/bar");
 		goto err;
 	}
@@ -128,7 +131,8 @@ static int test_foo_bar(void)
 	       "This ping in cgroup /foo/bar should pass...\n");
 	assert(system(PING_CMD) == 0);
 
-	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_OVERRIDE)) {
 		log_err("Attaching prog to /foo/bar");
 		goto err;
 	}
@@ -161,13 +165,15 @@ static int test_foo_bar(void)
 		goto err;
 	}
 
-	if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+			     BPF_F_ALLOW_OVERRIDE)) {
 		errno = 0;
 		log_err("Unexpected success attaching overridable prog to /foo/bar");
 		goto err;
 	}
 
-	if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS,
+			     BPF_F_ALLOW_OVERRIDE)) {
 		errno = 0;
 		log_err("Unexpected success attaching overridable prog to /foo");
 		goto err;
@@ -273,27 +279,33 @@ static int test_multiprog(void)
 	if (join_cgroup("/cg1/cg2/cg3/cg4/cg5"))
 		goto err;
 
-	if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+	if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_MULTI)) {
 		log_err("Attaching prog to cg1");
 		goto err;
 	}
-	if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+	if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+			     BPF_F_ALLOW_MULTI)) {
 		log_err("Unexpected success attaching the same prog to cg1");
 		goto err;
 	}
-	if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+	if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_MULTI)) {
 		log_err("Attaching prog2 to cg1");
 		goto err;
 	}
-	if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_OVERRIDE)) {
 		log_err("Attaching prog to cg2");
 		goto err;
 	}
-	if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) {
+	if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_MULTI)) {
 		log_err("Attaching prog to cg3");
 		goto err;
 	}
-	if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 1)) {
+	if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS,
+			    BPF_F_ALLOW_OVERRIDE)) {
 		log_err("Attaching prog to cg4");
 		goto err;
 	}
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
new file mode 100755
index 000000000000..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+	echo "SUCCESS!"
+else
+	echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
index 312e1722a39f..ae7f7c38309b 100755
--- a/samples/bpf/test_tunnel_bpf.sh
+++ b/samples/bpf/test_tunnel_bpf.sh
@@ -33,10 +33,43 @@ function add_gre_tunnel {
 	ip addr add dev $DEV 10.1.1.200/24
 }
 
-function add_erspan_tunnel {
+function add_ip6gretap_tunnel {
+
+	# assign ipv6 address
+	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add dev veth1 ::22/96
+	ip link set dev veth1 up
+
 	# in namespace
 	ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE seq key 2 local 172.16.1.100 remote 172.16.1.200 erspan 123
+		ip link add dev $DEV_NS type $TYPE flowlabel 0xbcdef key 2 \
+		local ::11 remote ::22
+
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+	# out of namespace
+	ip link add dev $DEV type $TYPE external
+	ip addr add dev $DEV 10.1.1.200/24
+	ip addr add dev $DEV fc80::200/24
+	ip link set dev $DEV up
+}
+
+function add_erspan_tunnel {
+	# in namespace
+	if [ "$1" == "v1" ]; then
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local 172.16.1.100 remote 172.16.1.200 \
+		erspan_ver 1 erspan 123
+	else
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local 172.16.1.100 remote 172.16.1.200 \
+		erspan_ver 2 erspan_dir 1 erspan_hwid 3
+	fi
 	ip netns exec at_ns0 ip link set dev $DEV_NS up
 	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
 
@@ -46,6 +79,35 @@ function add_erspan_tunnel {
 	ip addr add dev $DEV 10.1.1.200/24
 }
 
+function add_ip6erspan_tunnel {
+
+	# assign ipv6 address
+	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add dev veth1 ::22/96
+	ip link set dev veth1 up
+
+	# in namespace
+	if [ "$1" == "v1" ]; then
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local ::11 remote ::22 \
+		erspan_ver 1 erspan 123
+	else
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local ::11 remote ::22 \
+		erspan_ver 2 erspan_dir 1 erspan_hwid 7
+	fi
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+	# out of namespace
+	ip link add dev $DEV type $TYPE external
+	ip addr add dev $DEV 10.1.1.200/24
+	ip link set dev $DEV up
+}
+
 function add_vxlan_tunnel {
 	# Set static ARP entry here because iptables set-mark works
 	# on L3 packet, as a result not applying to ARP packets,
@@ -113,18 +175,65 @@ function test_gre {
 	cleanup
 }
 
+function test_ip6gre {
+	TYPE=ip6gre
+	DEV_NS=ip6gre00
+	DEV=ip6gre11
+	config_device
+	# reuse the ip6gretap function
+	add_ip6gretap_tunnel
+	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+	# underlay
+	ping6 -c 4 ::11
+	# overlay: ipv4 over ipv6
+	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	ping -c 1 10.1.1.100
+	# overlay: ipv6 over ipv6
+	ip netns exec at_ns0 ping6 -c 1 fc80::200
+	cleanup
+}
+
+function test_ip6gretap {
+	TYPE=ip6gretap
+	DEV_NS=ip6gretap00
+	DEV=ip6gretap11
+	config_device
+	add_ip6gretap_tunnel
+	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+	# underlay
+	ping6 -c 4 ::11
+	# overlay: ipv4 over ipv6
+	ip netns exec at_ns0 ping -i .2 -c 1 10.1.1.200
+	ping -c 1 10.1.1.100
+	# overlay: ipv6 over ipv6
+	ip netns exec at_ns0 ping6 -c 1 fc80::200
+	cleanup
+}
+
 function test_erspan {
 	TYPE=erspan
 	DEV_NS=erspan00
 	DEV=erspan11
 	config_device
-	add_erspan_tunnel
+	add_erspan_tunnel $1
 	attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel
 	ping -c 1 10.1.1.100
 	ip netns exec at_ns0 ping -c 1 10.1.1.200
 	cleanup
 }
 
+function test_ip6erspan {
+	TYPE=ip6erspan
+	DEV_NS=ip6erspan00
+	DEV=ip6erspan11
+	config_device
+	add_ip6erspan_tunnel $1
+	attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel
+	ping6 -c 3 ::11
+	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	cleanup
+}
+
 function test_vxlan {
 	TYPE=vxlan
 	DEV_NS=vxlan00
@@ -175,9 +284,12 @@ function cleanup {
 	ip link del veth1
 	ip link del ipip11
 	ip link del gretap11
+	ip link del ip6gre11
+	ip link del ip6gretap11
 	ip link del vxlan11
 	ip link del geneve11
 	ip link del erspan11
+	ip link del ip6erspan11
 	pkill tcpdump
 	pkill cat
 	set -ex
@@ -187,8 +299,16 @@ trap cleanup 0 2 3 6 9
 cleanup
 echo "Testing GRE tunnel..."
 test_gre
+echo "Testing IP6GRE tunnel..."
+test_ip6gre
+echo "Testing IP6GRETAP tunnel..."
+test_ip6gretap
 echo "Testing ERSPAN tunnel..."
-test_erspan
+test_erspan v1
+test_erspan v2
+echo "Testing IP6ERSPAN tunnel..."
+test_ip6erspan v1
+test_ip6erspan v2
 echo "Testing VXLAN tunnel..."
 test_vxlan
 echo "Testing GENEVE tunnel..."
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index 000000000000..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	unsigned long rc = -12;
+
+	bpf_override_return(ctx, rc);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index 000000000000..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+	FILE *f;
+	char filename[256];
+	char command[256];
+	int ret;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+	f = popen(command, "r");
+	ret = pclose(f);
+
+	return ret ? 0 : 1;
+}
diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh
new file mode 100755
index 000000000000..b9c9549c4c27
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+#
+# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load
+# eBPF programs, both for XDP and clsbpf.  Shell script function
+# wrappers and even long options parsing is illustrated, for ease of
+# use.
+#
+# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs
+# that need to collaborate between XDP and TC hooks.  Thus, it is
+# convenient that the same tool load both programs that need to work
+# together.
+#
+BPF_FILE=xdp2skb_meta_kern.o
+DIR=$(dirname $0)
+
+export TC=/usr/sbin/tc
+export IP=/usr/sbin/ip
+
+function usage() {
+    echo ""
+    echo "Usage: $0 [-vfh] --dev ethX"
+    echo "  -d | --dev     :             Network device (required)"
+    echo "  --flush        :             Cleanup flush TC and XDP progs"
+    echo "  --list         : (\$LIST)     List TC and XDP progs"
+    echo "  -v | --verbose : (\$VERBOSE)  Verbose"
+    echo "  --dry-run      : (\$DRYRUN)   Dry-run only (echo commands)"
+    echo ""
+}
+
+## -- General shell logging cmds --
+function err() {
+    local exitcode=$1
+    shift
+    echo "ERROR: $@" >&2
+    exit $exitcode
+}
+
+function info() {
+    if [[ -n "$VERBOSE" ]]; then
+	echo "# $@"
+    fi
+}
+
+## -- Helper function calls --
+
+# Wrapper call for TC and IP
+# - Will display the offending command on failure
+function _call_cmd() {
+    local cmd="$1"
+    local allow_fail="$2"
+    shift 2
+    if [[ -n "$VERBOSE" ]]; then
+	echo "$(basename $cmd) $@"
+    fi
+    if [[ -n "$DRYRUN" ]]; then
+	return
+    fi
+    $cmd "$@"
+    local status=$?
+    if (( $status != 0 )); then
+	if [[ "$allow_fail" == "" ]]; then
+	    err 2 "Exec error($status) occurred cmd: \"$cmd $@\""
+	fi
+    fi
+}
+function call_tc() {
+    _call_cmd "$TC" "" "$@"
+}
+function call_tc_allow_fail() {
+    _call_cmd "$TC" "allow_fail" "$@"
+}
+function call_ip() {
+    _call_cmd "$IP" "" "$@"
+}
+
+##  --- Parse command line arguments / parameters ---
+# Using external program "getopt" to get --long-options
+OPTIONS=$(getopt -o vfhd: \
+    --long verbose,flush,help,list,dev:,dry-run -- "$@")
+if (( $? != 0 )); then
+    err 4 "Error calling getopt"
+fi
+eval set -- "$OPTIONS"
+
+unset DEV
+unset FLUSH
+while true; do
+    case "$1" in
+	-d | --dev ) # device
+	    DEV=$2
+	    info "Device set to: DEV=$DEV" >&2
+	    shift 2
+	    ;;
+	-v | --verbose)
+	    VERBOSE=yes
+	    # info "Verbose mode: VERBOSE=$VERBOSE" >&2
+	    shift
+	    ;;
+	--dry-run )
+	    DRYRUN=yes
+	    VERBOSE=yes
+	    info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2
+	    shift
+            ;;
+	-f | --flush )
+	    FLUSH=yes
+	    shift
+	    ;;
+	--list )
+	    LIST=yes
+	    shift
+	    ;;
+	-- )
+	    shift
+	    break
+	    ;;
+	-h | --help )
+	    usage;
+	    exit 0
+	    ;;
+	* )
+	    shift
+	    break
+	    ;;
+    esac
+done
+
+FILE="$DIR/$BPF_FILE"
+if [[ ! -e $FILE ]]; then
+    err 3 "Missing BPF object file ($FILE)"
+fi
+
+if [[ -z $DEV ]]; then
+    usage
+    err 2 "Please specify network device -- required option --dev"
+fi
+
+## -- Function calls --
+
+function list_tc()
+{
+    local device="$1"
+    shift
+    info "Listing current TC ingress rules"
+    call_tc filter show dev $device ingress
+}
+
+function list_xdp()
+{
+    local device="$1"
+    shift
+    info "Listing current XDP device($device) setting"
+    call_ip link show dev $device | grep --color=auto xdp
+}
+
+function flush_tc()
+{
+    local device="$1"
+    shift
+    info "Flush TC on device: $device"
+    call_tc_allow_fail filter del dev $device ingress
+    call_tc_allow_fail qdisc del dev $device clsact
+}
+
+function flush_xdp()
+{
+    local device="$1"
+    shift
+    info "Flush XDP on device: $device"
+    call_ip link set dev $device xdp off
+}
+
+function attach_tc_mark()
+{
+    local device="$1"
+    local file="$2"
+    local prog="tc_mark"
+    shift 2
+
+    # Re-attach clsact to clear/flush existing role
+    call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null
+    call_tc            qdisc add dev $device clsact
+
+    # Attach BPF prog
+    call_tc filter add dev $device ingress \
+	    prio 1 handle 1 bpf da obj $file sec $prog
+}
+
+function attach_xdp_mark()
+{
+    local device="$1"
+    local file="$2"
+    local prog="xdp_mark"
+    shift 2
+
+    # Remove XDP prog in-case it's already loaded
+    # TODO: Need ip-link option to override/replace existing XDP prog
+    flush_xdp $device
+
+    # Attach XDP/BPF prog
+    call_ip link set dev $device xdp obj $file sec $prog
+}
+
+if [[ -n $FLUSH ]]; then
+    flush_tc  $DEV
+    flush_xdp $DEV
+    exit 0
+fi
+
+if [[ -n $LIST ]]; then
+    list_tc  $DEV
+    list_xdp $DEV
+    exit 0
+fi
+
+attach_tc_mark  $DEV $FILE
+attach_xdp_mark $DEV $FILE
diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c
new file mode 100644
index 000000000000..0c12048ac79f
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta_kern.c
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * Example howto transfer info from XDP to SKB, e.g. skb->mark
+ * -----------------------------------------------------------
+ * This uses the XDP data_meta infrastructure, and is a cooperation
+ * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook.
+ *
+ * Notice: This example does not use the BPF C-loader (bpf_load.c),
+ * but instead rely on the iproute2 TC tool for loading BPF-objects.
+ */
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/pkt_cls.h>
+
+#include "bpf_helpers.h"
+
+/*
+ * This struct is stored in the XDP 'data_meta' area, which is located
+ * just in-front-of the raw packet payload data.  The meaning is
+ * specific to these two BPF programs that use it as a communication
+ * channel.  XDP adjust/increase the area via a bpf-helper, and TC use
+ * boundary checks to see if data have been provided.
+ *
+ * The struct must be 4 byte aligned, which here is enforced by the
+ * struct __attribute__((aligned(4))).
+ */
+struct meta_info {
+	__u32 mark;
+} __attribute__((aligned(4)));
+
+SEC("xdp_mark")
+int _xdp_mark(struct xdp_md *ctx)
+{
+	struct meta_info *meta;
+	void *data, *data_end;
+	int ret;
+
+	/* Reserve space in-front of data pointer for our meta info.
+	 * (Notice drivers not supporting data_meta will fail here!)
+	 */
+	ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta));
+	if (ret < 0)
+		return XDP_ABORTED;
+
+	/* Notice: Kernel-side verifier requires that loading of
+	 * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(),
+	 * as pkt-data pointers are invalidated.  Helpers that require
+	 * this are determined/marked by bpf_helper_changes_pkt_data()
+	 */
+	data = (void *)(unsigned long)ctx->data;
+
+	/* Check data_meta have room for meta_info struct */
+	meta = (void *)(unsigned long)ctx->data_meta;
+	if (meta + 1 > data)
+		return XDP_ABORTED;
+
+	meta->mark = 42;
+
+	return XDP_PASS;
+}
+
+SEC("tc_mark")
+int _tc_mark(struct __sk_buff *ctx)
+{
+	void *data      = (void *)(unsigned long)ctx->data;
+	void *data_end  = (void *)(unsigned long)ctx->data_end;
+	void *data_meta = (void *)(unsigned long)ctx->data_meta;
+	struct meta_info *meta = data_meta;
+
+	/* Check XDP gave us some data_meta */
+	if (meta + 1 > data) {
+		ctx->mark = 41;
+		 /* Skip "accept" if no data_meta is avail */
+		return TC_ACT_OK;
+	}
+
+	/* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */
+	ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */
+
+	return TC_ACT_OK;
+}
+
+/* Manually attaching these programs:
+export DEV=ixgbe2
+export FILE=xdp2skb_meta_kern.o
+
+# via TC command
+tc qdisc del dev $DEV clsact 2> /dev/null
+tc qdisc add dev $DEV clsact
+tc filter  add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark
+tc filter show dev $DEV ingress
+
+# XDP via IP command:
+ip link set dev $DEV xdp off
+ip link set dev $DEV xdp obj $FILE sec xdp_mark
+
+# Use iptable to "see" if SKBs are marked
+iptables -I INPUT -p icmp -m mark --mark 41  # == 0x29
+iptables -I INPUT -p icmp -m mark --mark 42  # == 0x2a
+
+# Hint: catch XDP_ABORTED errors via
+perf record -e xdp:*
+perf script
+
+*/
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 2fe2f761a0d0..211db8ded0de 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -1,6 +1,7 @@
-/* XDP monitor tool, based on tracepoints
+/* SPDX-License-Identifier: GPL-2.0
+ *  Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
  *
- *  Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * XDP monitor tool, based on tracepoints
  */
 #include <uapi/linux/bpf.h>
 #include "bpf_helpers.h"
@@ -104,7 +105,7 @@ struct xdp_exception_ctx {
 SEC("tracepoint/xdp/xdp_exception")
 int trace_xdp_exception(struct xdp_exception_ctx *ctx)
 {
-	u64 *cnt;;
+	u64 *cnt;
 	u32 key;
 
 	key = ctx->act;
@@ -118,3 +119,92 @@ int trace_xdp_exception(struct xdp_exception_ctx *ctx)
 
 	return 0;
 }
+
+/* Common stats data record shared with _user.c */
+struct datarec {
+	u64 processed;
+	u64 dropped;
+	u64 info;
+};
+#define MAX_CPUS 64
+
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= MAX_CPUS,
+};
+
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	int cpu;		//	offset:16; size:4; signed:1;
+	unsigned int drops;	//	offset:20; size:4; signed:0;
+	unsigned int processed;	//	offset:24; size:4; signed:0;
+	int to_cpu;		//	offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+	u32 to_cpu = ctx->to_cpu;
+	struct datarec *rec;
+
+	if (to_cpu >= MAX_CPUS)
+		return 1;
+
+	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->processed;
+	rec->dropped   += ctx->drops;
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	if (ctx->processed > 0)
+		rec->info += 1;
+
+	return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	int cpu;		//	offset:16; size:4; signed:1;
+	unsigned int drops;	//	offset:20; size:4; signed:0;
+	unsigned int processed;	//	offset:24; size:4; signed:0;
+	int sched;		//	offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->processed;
+	rec->dropped   += ctx->drops;
+
+	/* Count times kthread yielded CPU via schedule call */
+	if (ctx->sched)
+		rec->info++;
+
+	return 0;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index eaba165b3549..eec14520d513 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -1,4 +1,5 @@
-/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
  */
 static const char *__doc__=
  "XDP monitor tool, based on tracepoints\n"
@@ -40,6 +41,9 @@ static const struct option long_options[] = {
 	{0, 0, NULL,  0 }
 };
 
+/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
+#define EXIT_FAIL_MEM	5
+
 static void usage(char *argv[])
 {
 	int i;
@@ -108,23 +112,93 @@ static const char *action2str(int action)
 	return NULL;
 }
 
+/* Common stats data record shared with _kern.c */
+struct datarec {
+	__u64 processed;
+	__u64 dropped;
+	__u64 info;
+};
+#define MAX_CPUS 64
+
+/* Userspace structs for collection of stats from maps */
 struct record {
-	__u64 counter;
 	__u64 timestamp;
+	struct datarec total;
+	struct datarec *cpu;
+};
+struct u64rec {
+	__u64 processed;
+};
+struct record_u64 {
+	/* record for _kern side __u64 values */
+	__u64 timestamp;
+	struct u64rec total;
+	struct u64rec *cpu;
 };
 
 struct stats_record {
-	struct record xdp_redir[REDIR_RES_MAX];
-	struct record xdp_exception[XDP_ACTION_MAX];
+	struct record_u64 xdp_redirect[REDIR_RES_MAX];
+	struct record_u64 xdp_exception[XDP_ACTION_MAX];
+	struct record xdp_cpumap_kthread;
+	struct record xdp_cpumap_enqueue[MAX_CPUS];
 };
 
-static void stats_print_headers(bool err_only)
+static bool map_collect_record(int fd, __u32 key, struct record *rec)
 {
-	if (err_only)
-		printf("\n%s\n", __doc_err_only__);
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec values[nr_cpus];
+	__u64 sum_processed = 0;
+	__u64 sum_dropped = 0;
+	__u64 sum_info = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
 
-	printf("%-14s %-11s %-10s %-18s %-9s\n",
-	       "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_processed        += values[i].processed;
+		rec->cpu[i].dropped = values[i].dropped;
+		sum_dropped        += values[i].dropped;
+		rec->cpu[i].info = values[i].info;
+		sum_info        += values[i].info;
+	}
+	rec->total.processed = sum_processed;
+	rec->total.dropped   = sum_dropped;
+	rec->total.info      = sum_info;
+	return true;
+}
+
+static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
+{
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct u64rec values[nr_cpus];
+	__u64 sum_total = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
+
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_total            += values[i].processed;
+	}
+	rec->total.processed = sum_total;
+	return true;
 }
 
 static double calc_period(struct record *r, struct record *p)
@@ -139,77 +213,203 @@ static double calc_period(struct record *r, struct record *p)
 	return period_;
 }
 
-static double calc_pps(struct record *r, struct record *p, double period)
+static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
+{
+	double period_ = 0;
+	__u64 period = 0;
+
+	period = r->timestamp - p->timestamp;
+	if (period > 0)
+		period_ = ((double) period / NANOSEC_PER_SEC);
+
+	return period_;
+}
+
+static double calc_pps(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period;
+	}
+	return pps;
+}
+
+static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period;
+	}
+	return pps;
+}
+
+static double calc_drop(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->dropped - p->dropped;
+		pps = packets / period;
+	}
+	return pps;
+}
+
+static double calc_info(struct datarec *r, struct datarec *p, double period)
 {
 	__u64 packets = 0;
 	double pps = 0;
 
 	if (period > 0) {
-		packets = r->counter - p->counter;
+		packets = r->info - p->info;
 		pps = packets / period;
 	}
 	return pps;
 }
 
-static void stats_print(struct stats_record *rec,
-			struct stats_record *prev,
+static void stats_print(struct stats_record *stats_rec,
+			struct stats_record *stats_prev,
 			bool err_only)
 {
-	double period = 0, pps = 0;
-	struct record *r, *p;
-	int i = 0;
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	int rec_i = 0, i, to_cpu;
+	double t = 0, pps = 0;
 
-	char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+	/* Header */
+	printf("%-15s %-7s %-12s %-12s %-9s\n",
+	       "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
 
 	/* tracepoint: xdp:xdp_redirect_* */
 	if (err_only)
-		i = REDIR_ERROR;
-
-	for (; i < REDIR_RES_MAX; i++) {
-		r = &rec->xdp_redir[i];
-		p = &prev->xdp_redir[i];
-
-		if (p->timestamp) {
-			period = calc_period(r, p);
-			pps = calc_pps(r, p, period);
+		rec_i = REDIR_ERROR;
+
+	for (; rec_i < REDIR_RES_MAX; rec_i++) {
+		struct record_u64 *rec, *prev;
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+		rec  =  &stats_rec->xdp_redirect[rec_i];
+		prev = &stats_prev->xdp_redirect[rec_i];
+		t = calc_period_u64(rec, prev);
+
+		for (i = 0; i < nr_cpus; i++) {
+			struct u64rec *r = &rec->cpu[i];
+			struct u64rec *p = &prev->cpu[i];
+
+			pps = calc_pps_u64(r, p, t);
+			if (pps > 0)
+				printf(fmt1, "XDP_REDIRECT", i,
+				       rec_i ? 0.0: pps, rec_i ? pps : 0.0,
+				       err2str(rec_i));
 		}
-		printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+		pps = calc_pps_u64(&rec->total, &prev->total, t);
+		printf(fmt2, "XDP_REDIRECT", "total",
+		       rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
 	}
 
 	/* tracepoint: xdp:xdp_exception */
-	for (i = 0; i < XDP_ACTION_MAX; i++) {
-		r = &rec->xdp_exception[i];
-		p = &prev->xdp_exception[i];
-		if (p->timestamp) {
-			period = calc_period(r, p);
-			pps = calc_pps(r, p, period);
+	for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
+		struct record_u64 *rec, *prev;
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+		rec  =  &stats_rec->xdp_exception[rec_i];
+		prev = &stats_prev->xdp_exception[rec_i];
+		t = calc_period_u64(rec, prev);
+
+		for (i = 0; i < nr_cpus; i++) {
+			struct u64rec *r = &rec->cpu[i];
+			struct u64rec *p = &prev->cpu[i];
+
+			pps = calc_pps_u64(r, p, t);
+			if (pps > 0)
+				printf(fmt1, "Exception", i,
+				       0.0, pps, err2str(rec_i));
 		}
+		pps = calc_pps_u64(&rec->total, &prev->total, t);
 		if (pps > 0)
-			printf(fmt, action2str(i), "Exception",
-			       pps, pps, period);
+			printf(fmt2, "Exception", "total",
+			       0.0, pps, action2str(rec_i));
 	}
-	printf("\n");
-}
 
-static __u64 get_key32_value64_percpu(int fd, __u32 key)
-{
-	/* For percpu maps, userspace gets a value per possible CPU */
-	unsigned int nr_cpus = bpf_num_possible_cpus();
-	__u64 values[nr_cpus];
-	__u64 sum = 0;
-	int i;
-
-	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
-		fprintf(stderr,
-			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
-		return 0;
+	/* cpumap enqueue stats */
+	for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+		char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		struct record *rec, *prev;
+		char *info_str = "";
+		double drop, info;
+
+		rec  =  &stats_rec->xdp_cpumap_enqueue[to_cpu];
+		prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0) {
+				info_str = "bulk-average";
+				info = pps / info; /* calc average bulk size */
+			}
+			if (pps > 0)
+				printf(fmt1, "cpumap-enqueue",
+				       i, to_cpu, pps, drop, info, info_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		if (pps > 0) {
+			drop = calc_drop(&rec->total, &prev->total, t);
+			info = calc_info(&rec->total, &prev->total, t);
+			if (info > 0) {
+				info_str = "bulk-average";
+				info = pps / info; /* calc average bulk size */
+			}
+			printf(fmt2, "cpumap-enqueue",
+			       "sum", to_cpu, pps, drop, info, info_str);
+		}
 	}
 
-	/* Sum values from each CPU */
-	for (i = 0; i < nr_cpus; i++) {
-		sum += values[i];
+	/* cpumap kthread stats */
+	{
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
+		struct record *rec, *prev;
+		double drop, info;
+		char *i_str = "";
+
+		rec  =  &stats_rec->xdp_cpumap_kthread;
+		prev = &stats_prev->xdp_cpumap_kthread;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0)
+				i_str = "sched";
+			if (pps > 0)
+				printf(fmt1, "cpumap-kthread",
+				       i, pps, drop, info, i_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop(&rec->total, &prev->total, t);
+		info = calc_info(&rec->total, &prev->total, t);
+		if (info > 0)
+			i_str = "sched-sum";
+		printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
 	}
-	return sum;
+
+	printf("\n");
 }
 
 static bool stats_collect(struct stats_record *rec)
@@ -222,25 +422,109 @@ static bool stats_collect(struct stats_record *rec)
 	 */
 
 	fd = map_data[0].fd; /* map0: redirect_err_cnt */
-	for (i = 0; i < REDIR_RES_MAX; i++) {
-		rec->xdp_redir[i].timestamp = gettime();
-		rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
-	}
+	for (i = 0; i < REDIR_RES_MAX; i++)
+		map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
 
 	fd = map_data[1].fd; /* map1: exception_cnt */
 	for (i = 0; i < XDP_ACTION_MAX; i++) {
-		rec->xdp_exception[i].timestamp = gettime();
-		rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+		map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
 	}
 
+	fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+	for (i = 0; i < MAX_CPUS; i++)
+		map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
+
+	fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+	map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+
 	return true;
 }
 
+static void *alloc_rec_per_cpu(int record_size)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	void *array;
+	size_t size;
+
+	size = record_size * nr_cpus;
+	array = malloc(size);
+	memset(array, 0, size);
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+	struct stats_record *rec;
+	int rec_sz;
+	int i;
+
+	/* Alloc main stats_record structure */
+	rec = malloc(sizeof(*rec));
+	memset(rec, 0, sizeof(*rec));
+	if (!rec) {
+		fprintf(stderr, "Mem alloc error\n");
+		exit(EXIT_FAIL_MEM);
+	}
+
+	/* Alloc stats stored per CPU for each record */
+	rec_sz = sizeof(struct u64rec);
+	for (i = 0; i < REDIR_RES_MAX; i++)
+		rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+	for (i = 0; i < XDP_ACTION_MAX; i++)
+		rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+	rec_sz = sizeof(struct datarec);
+	rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+
+	for (i = 0; i < MAX_CPUS; i++)
+		rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+	return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+	int i;
+
+	for (i = 0; i < REDIR_RES_MAX; i++)
+		free(r->xdp_redirect[i].cpu);
+
+	for (i = 0; i < XDP_ACTION_MAX; i++)
+		free(r->xdp_exception[i].cpu);
+
+	free(r->xdp_cpumap_kthread.cpu);
+
+	for (i = 0; i < MAX_CPUS; i++)
+		free(r->xdp_cpumap_enqueue[i].cpu);
+
+	free(r);
+}
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+	struct stats_record *tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
 static void stats_poll(int interval, bool err_only)
 {
-	struct stats_record rec, prev;
+	struct stats_record *rec, *prev;
 
-	memset(&rec, 0, sizeof(rec));
+	rec  = alloc_stats_record();
+	prev = alloc_stats_record();
+	stats_collect(rec);
+
+	if (err_only)
+		printf("\n%s\n", __doc_err_only__);
 
 	/* Trick to pretty printf with thousands separators use %' */
 	setlocale(LC_NUMERIC, "en_US");
@@ -258,13 +542,15 @@ static void stats_poll(int interval, bool err_only)
 	fflush(stdout);
 
 	while (1) {
-		memcpy(&prev, &rec, sizeof(rec));
-		stats_collect(&rec);
-		stats_print_headers(err_only);
-		stats_print(&rec, &prev, err_only);
+		swap(&prev, &rec);
+		stats_collect(rec);
+		stats_print(rec, prev, err_only);
 		fflush(stdout);
 		sleep(interval);
 	}
+
+	free_stats_record(rec);
+	free_stats_record(prev);
 }
 
 static void print_bpf_prog_info(void)
diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c
new file mode 100644
index 000000000000..3fd209291653
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_kern.c
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ *  Example howto extract XDP RX-queue info
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Config setup from with userspace
+ *
+ * User-side setup ifindex in config_map, to verify that
+ * ctx->ingress_ifindex is correct (against configured ifindex)
+ */
+struct config {
+	__u32 action;
+	int ifindex;
+};
+struct bpf_map_def SEC("maps") config_map = {
+	.type		= BPF_MAP_TYPE_ARRAY,
+	.key_size	= sizeof(int),
+	.value_size	= sizeof(struct config),
+	.max_entries	= 1,
+};
+
+/* Common stats data record (shared with userspace) */
+struct datarec {
+	__u64 processed;
+	__u64 issue;
+};
+
+struct bpf_map_def SEC("maps") stats_global_map = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+#define MAX_RXQs 64
+
+/* Stats per rx_queue_index (per CPU) */
+struct bpf_map_def SEC("maps") rx_queue_index_map = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= MAX_RXQs + 1,
+};
+
+SEC("xdp_prog0")
+int  xdp_prognum0(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct datarec *rec, *rxq_rec;
+	int ingress_ifindex;
+	struct config *config;
+	u32 key = 0;
+
+	/* Global stats record */
+	rec = bpf_map_lookup_elem(&stats_global_map, &key);
+	if (!rec)
+		return XDP_ABORTED;
+	rec->processed++;
+
+	/* Accessing ctx->ingress_ifindex, cause BPF to rewrite BPF
+	 * instructions inside kernel to access xdp_rxq->dev->ifindex
+	 */
+	ingress_ifindex = ctx->ingress_ifindex;
+
+	config = bpf_map_lookup_elem(&config_map, &key);
+	if (!config)
+		return XDP_ABORTED;
+
+	/* Simple test: check ctx provided ifindex is as expected */
+	if (ingress_ifindex != config->ifindex) {
+		/* count this error case */
+		rec->issue++;
+		return XDP_ABORTED;
+	}
+
+	/* Update stats per rx_queue_index. Handle if rx_queue_index
+	 * is larger than stats map can contain info for.
+	 */
+	key = ctx->rx_queue_index;
+	if (key >= MAX_RXQs)
+		key = MAX_RXQs;
+	rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key);
+	if (!rxq_rec)
+		return XDP_ABORTED;
+	rxq_rec->processed++;
+	if (key == MAX_RXQs)
+		rxq_rec->issue++;
+
+	return config->action;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
new file mode 100644
index 000000000000..32430e8b3a6a
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -0,0 +1,531 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+static const char *__doc__ = " XDP RX-queue info extract example\n\n"
+	"Monitor how many packets per sec (pps) are received\n"
+	"per NIC RX queue index and which CPU processed the packet\n"
+	;
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+/* Exit return codes */
+#define EXIT_OK		0
+#define EXIT_FAIL		1
+#define EXIT_FAIL_OPTION	2
+#define EXIT_FAIL_XDP		3
+#define EXIT_FAIL_BPF		4
+#define EXIT_FAIL_MEM		5
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"dev",		required_argument,	NULL, 'd' },
+	{"skb-mode",	no_argument,		NULL, 'S' },
+	{"sec",		required_argument,	NULL, 's' },
+	{"no-separators", no_argument,		NULL, 'z' },
+	{"action",	required_argument,	NULL, 'a' },
+	{0, 0, NULL,  0 }
+};
+
+static void int_exit(int sig)
+{
+	fprintf(stderr,
+		"Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+		ifindex, ifname);
+	if (ifindex > -1)
+		set_link_xdp_fd(ifindex, -1, xdp_flags);
+	exit(EXIT_OK);
+}
+
+struct config {
+	__u32 action;
+	int ifindex;
+};
+#define XDP_ACTION_MAX (XDP_TX + 1)
+#define XDP_ACTION_MAX_STRLEN 11
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+	[XDP_ABORTED]	= "XDP_ABORTED",
+	[XDP_DROP]	= "XDP_DROP",
+	[XDP_PASS]	= "XDP_PASS",
+	[XDP_TX]	= "XDP_TX",
+};
+
+static const char *action2str(int action)
+{
+	if (action < XDP_ACTION_MAX)
+		return xdp_action_names[action];
+	return NULL;
+}
+
+static int parse_xdp_action(char *action_str)
+{
+	size_t maxlen;
+	__u64 action = -1;
+	int i;
+
+	for (i = 0; i < XDP_ACTION_MAX; i++) {
+		maxlen = XDP_ACTION_MAX_STRLEN;
+		if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) {
+			action = i;
+			break;
+		}
+	}
+	return action;
+}
+
+static void list_xdp_actions(void)
+{
+	int i;
+
+	printf("Available XDP --action <options>\n");
+	for (i = 0; i < XDP_ACTION_MAX; i++)
+		printf("\t%s\n", xdp_action_names[i]);
+	printf("\n");
+}
+
+static void usage(char *argv[])
+{
+	int i;
+
+	printf("\nDOCUMENTATION:\n%s\n", __doc__);
+	printf(" Usage: %s (options-see-below)\n", argv[0]);
+	printf(" Listing options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		if (long_options[i].flag != NULL)
+			printf(" flag (internal value:%d)",
+				*long_options[i].flag);
+		else
+			printf(" short-option: -%c",
+				long_options[i].val);
+		printf("\n");
+	}
+	printf("\n");
+	list_xdp_actions();
+}
+
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+	struct timespec t;
+	int res;
+
+	res = clock_gettime(CLOCK_MONOTONIC, &t);
+	if (res < 0) {
+		fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+		exit(EXIT_FAIL);
+	}
+	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+	__u64 processed;
+	__u64 issue;
+};
+struct record {
+	__u64 timestamp;
+	struct datarec total;
+	struct datarec *cpu;
+};
+struct stats_record {
+	struct record stats;
+	struct record *rxq;
+};
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec *array;
+	size_t size;
+
+	size = sizeof(struct datarec) * nr_cpus;
+	array = malloc(size);
+	memset(array, 0, size);
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+static struct record *alloc_record_per_rxq(void)
+{
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	struct record *array;
+	size_t size;
+
+	size = sizeof(struct record) * nr_rxqs;
+	array = malloc(size);
+	memset(array, 0, size);
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	struct stats_record *rec;
+	int i;
+
+	rec = malloc(sizeof(*rec));
+	memset(rec, 0, sizeof(*rec));
+	if (!rec) {
+		fprintf(stderr, "Mem alloc error\n");
+		exit(EXIT_FAIL_MEM);
+	}
+	rec->rxq = alloc_record_per_rxq();
+	for (i = 0; i < nr_rxqs; i++)
+		rec->rxq[i].cpu = alloc_record_per_cpu();
+
+	rec->stats.cpu = alloc_record_per_cpu();
+	return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	int i;
+
+	for (i = 0; i < nr_rxqs; i++)
+		free(r->rxq[i].cpu);
+
+	free(r->rxq);
+	free(r->stats.cpu);
+	free(r);
+}
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec values[nr_cpus];
+	__u64 sum_processed = 0;
+	__u64 sum_issue = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
+
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_processed        += values[i].processed;
+		rec->cpu[i].issue = values[i].issue;
+		sum_issue        += values[i].issue;
+	}
+	rec->total.processed = sum_processed;
+	rec->total.issue     = sum_issue;
+	return true;
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+	int fd, i, max_rxqs;
+
+	fd = map_data[1].fd; /* map: stats_global_map */
+	map_collect_percpu(fd, 0, &rec->stats);
+
+	fd = map_data[2].fd; /* map: rx_queue_index_map */
+	max_rxqs = map_data[2].def.max_entries;
+	for (i = 0; i < max_rxqs; i++)
+		map_collect_percpu(fd, i, &rec->rxq[i]);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+	double period_ = 0;
+	__u64 period = 0;
+
+	period = r->timestamp - p->timestamp;
+	if (period > 0)
+		period_ = ((double) period / NANOSEC_PER_SEC);
+
+	return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+			    struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->issue - p->issue;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+			struct stats_record *stats_prev,
+			int action)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	double pps = 0, err = 0;
+	struct record *rec, *prev;
+	double t;
+	int rxq;
+	int i;
+
+	/* Header */
+	printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s\n",
+	       ifname, ifindex, action2str(action));
+
+	/* stats_global_map */
+	{
+		char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n";
+		char *fm2_rx = "%-15s %-7s %'-11.0f\n";
+		char *errstr = "";
+
+		printf("%-15s %-7s %-11s %-11s\n",
+		       "XDP stats", "CPU", "pps", "issue-pps");
+
+		rec  =  &stats_rec->stats;
+		prev = &stats_prev->stats;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps = calc_pps     (r, p, t);
+			err = calc_errs_pps(r, p, t);
+			if (err > 0)
+				errstr = "invalid-ifindex";
+			if (pps > 0)
+				printf(fmt_rx, "XDP-RX CPU",
+					i, pps, err, errstr);
+		}
+		pps  = calc_pps     (&rec->total, &prev->total, t);
+		err  = calc_errs_pps(&rec->total, &prev->total, t);
+		printf(fm2_rx, "XDP-RX CPU", "total", pps, err);
+	}
+
+	/* rx_queue_index_map */
+	printf("\n%-15s %-7s %-11s %-11s\n",
+	       "RXQ stats", "RXQ:CPU", "pps", "issue-pps");
+
+	for (rxq = 0; rxq < nr_rxqs; rxq++) {
+		char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n";
+		char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n";
+		char *errstr = "";
+		int rxq_ = rxq;
+
+		/* Last RXQ in map catch overflows */
+		if (rxq_ == nr_rxqs - 1)
+			rxq_ = -1;
+
+		rec  =  &stats_rec->rxq[rxq];
+		prev = &stats_prev->rxq[rxq];
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps = calc_pps     (r, p, t);
+			err = calc_errs_pps(r, p, t);
+			if (err > 0) {
+				if (rxq_ == -1)
+					errstr = "map-overflow-RXQ";
+				else
+					errstr = "err";
+			}
+			if (pps > 0)
+				printf(fmt_rx, "rx_queue_index",
+				       rxq_, i, pps, err, errstr);
+		}
+		pps  = calc_pps     (&rec->total, &prev->total, t);
+		err  = calc_errs_pps(&rec->total, &prev->total, t);
+		if (pps || err)
+			printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err);
+	}
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+	struct stats_record *tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+static void stats_poll(int interval, int action)
+{
+	struct stats_record *record, *prev;
+
+	record = alloc_stats_record();
+	prev   = alloc_stats_record();
+	stats_collect(record);
+
+	while (1) {
+		swap(&prev, &record);
+		stats_collect(record);
+		stats_print(record, prev, action);
+		sleep(interval);
+	}
+
+	free_stats_record(record);
+	free_stats_record(prev);
+}
+
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	bool use_separators = true;
+	struct config cfg = { 0 };
+	char filename[256];
+	int longindex = 0;
+	int interval = 2;
+	__u32 key = 0;
+	int opt, err;
+
+	char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
+	int action = XDP_PASS; /* Default action */
+	char *action_str = NULL;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+		return EXIT_FAIL;
+	}
+
+	if (!prog_fd[0]) {
+		fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+		return EXIT_FAIL;
+	}
+
+	/* Parse commands line args */
+	while ((opt = getopt_long(argc, argv, "hSd:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'd':
+			if (strlen(optarg) >= IF_NAMESIZE) {
+				fprintf(stderr, "ERR: --dev name too long\n");
+				goto error;
+			}
+			ifname = (char *)&ifname_buf;
+			strncpy(ifname, optarg, IF_NAMESIZE);
+			ifindex = if_nametoindex(ifname);
+			if (ifindex == 0) {
+				fprintf(stderr,
+					"ERR: --dev name unknown err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			break;
+		case 's':
+			interval = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'z':
+			use_separators = false;
+			break;
+		case 'a':
+			action_str = (char *)&action_str_buf;
+			strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN);
+			break;
+		case 'h':
+		error:
+		default:
+			usage(argv);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	/* Required option */
+	if (ifindex == -1) {
+		fprintf(stderr, "ERR: required option --dev missing\n");
+		usage(argv);
+		return EXIT_FAIL_OPTION;
+	}
+	cfg.ifindex = ifindex;
+
+	/* Parse action string */
+	if (action_str) {
+		action = parse_xdp_action(action_str);
+		if (action < 0) {
+			fprintf(stderr, "ERR: Invalid XDP --action: %s\n",
+				action_str);
+			list_xdp_actions();
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	cfg.action = action;
+
+	/* Trick to pretty printf with thousands separators use %' */
+	if (use_separators)
+		setlocale(LC_NUMERIC, "en_US");
+
+	/* User-side setup ifindex in config_map */
+	err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0);
+	if (err) {
+		fprintf(stderr, "Store config failed (err:%d)\n", err);
+		exit(EXIT_FAIL_BPF);
+	}
+
+	/* Remove XDP program when program is interrupted */
+	signal(SIGINT, int_exit);
+
+	if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+		fprintf(stderr, "link set xdp fd failed\n");
+		return EXIT_FAIL_XDP;
+	}
+
+	stats_poll(interval, action);
+	return EXIT_OK;
+}
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
index 3d115bd68442..72f9e6d1387b 100644
--- a/samples/livepatch/livepatch-callbacks-demo.c
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -197,21 +197,6 @@ static int livepatch_callbacks_demo_init(void)
 {
 	int ret;
 
-	if (!klp_have_reliable_stack() && !patch.immediate) {
-		/*
-		 * WARNING: Be very careful when using 'patch.immediate' in
-		 * your patches.  It's ok to use it for simple patches like
-		 * this, but for more complex patches which change function
-		 * semantics, locking semantics, or data structures, it may not
-		 * be safe.  Use of this option will also prevent removal of
-		 * the patch.
-		 *
-		 * See Documentation/livepatch/livepatch.txt for more details.
-		 */
-		patch.immediate = true;
-		pr_notice("The consistency model isn't supported for your architecture.  Bypassing safety mechanisms and applying the patch immediately.\n");
-	}
-
 	ret = klp_register_patch(&patch);
 	if (ret)
 		return ret;
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
index 84795223f15f..2d554dd930e2 100644
--- a/samples/livepatch/livepatch-sample.c
+++ b/samples/livepatch/livepatch-sample.c
@@ -71,21 +71,6 @@ static int livepatch_init(void)
 {
 	int ret;
 
-	if (!klp_have_reliable_stack() && !patch.immediate) {
-		/*
-		 * WARNING: Be very careful when using 'patch.immediate' in
-		 * your patches.  It's ok to use it for simple patches like
-		 * this, but for more complex patches which change function
-		 * semantics, locking semantics, or data structures, it may not
-		 * be safe.  Use of this option will also prevent removal of
-		 * the patch.
-		 *
-		 * See Documentation/livepatch/livepatch.txt for more details.
-		 */
-		patch.immediate = true;
-		pr_notice("The consistency model isn't supported for your architecture.  Bypassing safety mechanisms and applying the patch immediately.\n");
-	}
-
 	ret = klp_register_patch(&patch);
 	if (ret)
 		return ret;
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
index fbe0a1f3d99b..830c55514f9f 100644
--- a/samples/livepatch/livepatch-shadow-fix1.c
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -133,21 +133,6 @@ static int livepatch_shadow_fix1_init(void)
 {
 	int ret;
 
-	if (!klp_have_reliable_stack() && !patch.immediate) {
-		/*
-		 * WARNING: Be very careful when using 'patch.immediate' in
-		 * your patches.  It's ok to use it for simple patches like
-		 * this, but for more complex patches which change function
-		 * semantics, locking semantics, or data structures, it may not
-		 * be safe.  Use of this option will also prevent removal of
-		 * the patch.
-		 *
-		 * See Documentation/livepatch/livepatch.txt for more details.
-		 */
-		patch.immediate = true;
-		pr_notice("The consistency model isn't supported for your architecture.  Bypassing safety mechanisms and applying the patch immediately.\n");
-	}
-
 	ret = klp_register_patch(&patch);
 	if (ret)
 		return ret;
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
index 53c1794bdc5f..ff9948f0ec00 100644
--- a/samples/livepatch/livepatch-shadow-fix2.c
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -128,21 +128,6 @@ static int livepatch_shadow_fix2_init(void)
 {
 	int ret;
 
-	if (!klp_have_reliable_stack() && !patch.immediate) {
-		/*
-		 * WARNING: Be very careful when using 'patch.immediate' in
-		 * your patches.  It's ok to use it for simple patches like
-		 * this, but for more complex patches which change function
-		 * semantics, locking semantics, or data structures, it may not
-		 * be safe.  Use of this option will also prevent removal of
-		 * the patch.
-		 *
-		 * See Documentation/livepatch/livepatch.txt for more details.
-		 */
-		patch.immediate = true;
-		pr_notice("The consistency model isn't supported for your architecture.  Bypassing safety mechanisms and applying the patch immediately.\n");
-	}
-
 	ret = klp_register_patch(&patch);
 	if (ret)
 		return ret;
diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index 7cc9d228216f..7c25c0c112bc 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -23,8 +23,11 @@
 #include <stdbool.h>
 #include <signal.h>
 #include <fcntl.h>
+#include <sys/wait.h>
+#include <time.h>
 
 #include <sys/time.h>
+#include <sys/resource.h>
 #include <sys/types.h>
 
 #include <linux/netlink.h>
@@ -35,6 +38,8 @@
 #include <assert.h>
 #include <libgen.h>
 
+#include <getopt.h>
+
 #include "../bpf/bpf_load.h"
 #include "../bpf/bpf_util.h"
 #include "../bpf/libbpf.h"
@@ -46,15 +51,42 @@ void running_handler(int a);
 #define S1_PORT 10000
 #define S2_PORT 10001
 
-static int sockmap_test_sockets(int rate, int dot)
+/* global sockets */
+int s1, s2, c1, c2, p1, p2;
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"cgroup",	required_argument,	NULL, 'c' },
+	{"rate",	required_argument,	NULL, 'r' },
+	{"verbose",	no_argument,		NULL, 'v' },
+	{"iov_count",	required_argument,	NULL, 'i' },
+	{"length",	required_argument,	NULL, 'l' },
+	{"test",	required_argument,	NULL, 't' },
+	{0, 0, NULL, 0 }
+};
+
+static void usage(char *argv[])
+{
+	int i;
+
+	printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]);
+	printf(" options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		if (long_options[i].flag != NULL)
+			printf(" flag (internal value:%d)\n",
+				*long_options[i].flag);
+		else
+			printf(" -%c\n", long_options[i].val);
+	}
+	printf("\n");
+}
+
+static int sockmap_init_sockets(void)
 {
-	int i, sc, err, max_fd, one = 1;
-	int s1, s2, c1, c2, p1, p2;
+	int i, err, one = 1;
 	struct sockaddr_in addr;
-	struct timeval timeout;
-	char buf[1024] = {0};
 	int *fds[4] = {&s1, &s2, &c1, &c2};
-	fd_set w;
 
 	s1 = s2 = p1 = p2 = c1 = c2 = 0;
 
@@ -63,8 +95,7 @@ static int sockmap_test_sockets(int rate, int dot)
 		*fds[i] = socket(AF_INET, SOCK_STREAM, 0);
 		if (*fds[i] < 0) {
 			perror("socket s1 failed()");
-			err = *fds[i];
-			goto out;
+			return errno;
 		}
 	}
 
@@ -74,16 +105,16 @@ static int sockmap_test_sockets(int rate, int dot)
 				 (char *)&one, sizeof(one));
 		if (err) {
 			perror("setsockopt failed()");
-			goto out;
+			return errno;
 		}
 	}
 
 	/* Non-blocking sockets */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < 2; i++) {
 		err = ioctl(*fds[i], FIONBIO, (char *)&one);
 		if (err < 0) {
 			perror("ioctl s1 failed()");
-			goto out;
+			return errno;
 		}
 	}
 
@@ -96,14 +127,14 @@ static int sockmap_test_sockets(int rate, int dot)
 	err = bind(s1, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0) {
 		perror("bind s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	addr.sin_port = htons(S2_PORT);
 	err = bind(s2, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0) {
 		perror("bind s2 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	/* Listen server sockets */
@@ -111,14 +142,14 @@ static int sockmap_test_sockets(int rate, int dot)
 	err = listen(s1, 32);
 	if (err < 0) {
 		perror("listen s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	addr.sin_port = htons(S2_PORT);
 	err = listen(s2, 32);
 	if (err < 0) {
 		perror("listen s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	/* Initiate Connect */
@@ -126,46 +157,232 @@ static int sockmap_test_sockets(int rate, int dot)
 	err = connect(c1, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0 && errno != EINPROGRESS) {
 		perror("connect c1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	addr.sin_port = htons(S2_PORT);
 	err = connect(c2, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0 && errno != EINPROGRESS) {
 		perror("connect c2 failed()\n");
-		goto out;
+		return errno;
+	} else if (err < 0) {
+		err = 0;
 	}
 
 	/* Accept Connecrtions */
 	p1 = accept(s1, NULL, NULL);
 	if (p1 < 0) {
 		perror("accept s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	p2 = accept(s2, NULL, NULL);
 	if (p2 < 0) {
 		perror("accept s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
-	max_fd = p2;
-	timeout.tv_sec = 10;
-	timeout.tv_usec = 0;
-
 	printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
 	printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
 		c1, s1, c2, s2);
+	return 0;
+}
+
+struct msg_stats {
+	size_t bytes_sent;
+	size_t bytes_recvd;
+	struct timespec start;
+	struct timespec end;
+};
+
+static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
+		    struct msg_stats *s, bool tx)
+{
+	struct msghdr msg = {0};
+	int err, i, flags = MSG_NOSIGNAL;
+	struct iovec *iov;
+
+	iov = calloc(iov_count, sizeof(struct iovec));
+	if (!iov)
+		return errno;
+
+	for (i = 0; i < iov_count; i++) {
+		char *d = calloc(iov_length, sizeof(char));
+
+		if (!d) {
+			fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
+			goto out_errno;
+		}
+		iov[i].iov_base = d;
+		iov[i].iov_len = iov_length;
+	}
+
+	msg.msg_iov = iov;
+	msg.msg_iovlen = iov_count;
+
+	if (tx) {
+		clock_gettime(CLOCK_MONOTONIC, &s->start);
+		for (i = 0; i < cnt; i++) {
+			int sent = sendmsg(fd, &msg, flags);
+
+			if (sent < 0) {
+				perror("send loop error:");
+				goto out_errno;
+			}
+			s->bytes_sent += sent;
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	} else {
+		int slct, recv, max_fd = fd;
+		struct timeval timeout;
+		float total_bytes;
+		fd_set w;
+
+		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
+		if (err < 0)
+			perror("recv start time: ");
+		while (s->bytes_recvd < total_bytes) {
+			timeout.tv_sec = 1;
+			timeout.tv_usec = 0;
+
+			/* FD sets */
+			FD_ZERO(&w);
+			FD_SET(fd, &w);
+
+			slct = select(max_fd + 1, &w, NULL, NULL, &timeout);
+			if (slct == -1) {
+				perror("select()");
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			} else if (!slct) {
+				fprintf(stderr, "unexpected timeout\n");
+				errno = -EIO;
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			}
+
+			recv = recvmsg(fd, &msg, flags);
+			if (recv < 0) {
+				if (errno != EWOULDBLOCK) {
+					clock_gettime(CLOCK_MONOTONIC, &s->end);
+					perror("recv failed()\n");
+					goto out_errno;
+				}
+			}
+
+			s->bytes_recvd += recv;
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	}
+
+	for (i = 0; i < iov_count; i++)
+		free(iov[i].iov_base);
+	free(iov);
+	return 0;
+out_errno:
+	for (i = 0; i < iov_count; i++)
+		free(iov[i].iov_base);
+	free(iov);
+	return errno;
+}
+
+static float giga = 1000000000;
+
+static inline float sentBps(struct msg_stats s)
+{
+	return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec);
+}
+
+static inline float recvdBps(struct msg_stats s)
+{
+	return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec);
+}
+
+static int sendmsg_test(int iov_count, int iov_buf, int cnt,
+			int verbose, bool base)
+{
+	float sent_Bps = 0, recvd_Bps = 0;
+	int rx_fd, txpid, rxpid, err = 0;
+	struct msg_stats s = {0};
+	int status;
+
+	errno = 0;
+
+	if (base)
+		rx_fd = p1;
+	else
+		rx_fd = p2;
+
+	rxpid = fork();
+	if (rxpid == 0) {
+		err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false);
+		if (err)
+			fprintf(stderr,
+				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		shutdown(p2, SHUT_RDWR);
+		shutdown(p1, SHUT_RDWR);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		fprintf(stdout,
+			"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
+			s.bytes_sent, sent_Bps, sent_Bps/giga,
+			s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		exit(1);
+	} else if (rxpid == -1) {
+		perror("msg_loop_rx: ");
+		return errno;
+	}
+
+	txpid = fork();
+	if (txpid == 0) {
+		err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true);
+		if (err)
+			fprintf(stderr,
+				"msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		shutdown(c1, SHUT_RDWR);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		fprintf(stdout,
+			"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
+			s.bytes_sent, sent_Bps, sent_Bps/giga,
+			s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		exit(1);
+	} else if (txpid == -1) {
+		perror("msg_loop_tx: ");
+		return errno;
+	}
+
+	assert(waitpid(rxpid, &status, 0) == rxpid);
+	assert(waitpid(txpid, &status, 0) == txpid);
+	return err;
+}
+
+static int forever_ping_pong(int rate, int verbose)
+{
+	struct timeval timeout;
+	char buf[1024] = {0};
+	int sc;
+
+	timeout.tv_sec = 10;
+	timeout.tv_usec = 0;
 
 	/* Ping/Pong data from client to server */
 	sc = send(c1, buf, sizeof(buf), 0);
 	if (sc < 0) {
 		perror("send failed()\n");
-		goto out;
+		return sc;
 	}
 
 	do {
-		int s, rc, i;
+		int s, rc, i, max_fd = p2;
+		fd_set w;
 
 		/* FD sets */
 		FD_ZERO(&w);
@@ -193,7 +410,7 @@ static int sockmap_test_sockets(int rate, int dot)
 			if (rc < 0) {
 				if (errno != EWOULDBLOCK) {
 					perror("recv failed()\n");
-					break;
+					return rc;
 				}
 			}
 
@@ -205,35 +422,92 @@ static int sockmap_test_sockets(int rate, int dot)
 			sc = send(i, buf, rc, 0);
 			if (sc < 0) {
 				perror("send failed()\n");
-				break;
+				return sc;
 			}
 		}
-		sleep(rate);
-		if (dot) {
+
+		if (rate)
+			sleep(rate);
+
+		if (verbose) {
 			printf(".");
 			fflush(stdout);
 
 		}
 	} while (running);
 
-out:
-	close(s1);
-	close(s2);
-	close(p1);
-	close(p2);
-	close(c1);
-	close(c2);
-	return err;
+	return 0;
 }
 
+enum {
+	PING_PONG,
+	SENDMSG,
+	BASE,
+};
+
 int main(int argc, char **argv)
 {
-	int rate = 1, dot = 1;
+	int iov_count = 1, length = 1024, rate = 1, verbose = 0;
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	int opt, longindex, err, cg_fd = 0;
+	int test = PING_PONG;
 	char filename[256];
-	int err, cg_fd;
-	char *cg_path;
 
-	cg_path = argv[argc - 1];
+	while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		/* Cgroup configuration */
+		case 'c':
+			cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
+			if (cg_fd < 0) {
+				fprintf(stderr,
+					"ERROR: (%i) open cg path failed: %s\n",
+					cg_fd, optarg);
+				return cg_fd;
+			}
+			break;
+		case 'r':
+			rate = atoi(optarg);
+			break;
+		case 'v':
+			verbose = 1;
+			break;
+		case 'i':
+			iov_count = atoi(optarg);
+			break;
+		case 'l':
+			length = atoi(optarg);
+			break;
+		case 't':
+			if (strcmp(optarg, "ping") == 0) {
+				test = PING_PONG;
+			} else if (strcmp(optarg, "sendmsg") == 0) {
+				test = SENDMSG;
+			} else if (strcmp(optarg, "base") == 0) {
+				test = BASE;
+			} else {
+				usage(argv);
+				return -1;
+			}
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return -1;
+		}
+	}
+
+	if (!cg_fd) {
+		fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
+			argv[0]);
+		return -1;
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
 	running = 1;
@@ -241,20 +515,16 @@ int main(int argc, char **argv)
 	/* catch SIGINT */
 	signal(SIGINT, running_handler);
 
+	/* If base test skip BPF setup */
+	if (test == BASE)
+		goto run;
+
 	if (load_bpf_file(filename)) {
 		fprintf(stderr, "load_bpf_file: (%s) %s\n",
 			filename, strerror(errno));
 		return 1;
 	}
 
-	/* Cgroup configuration */
-	cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
-	if (cg_fd < 0) {
-		fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n",
-			cg_fd, cg_path);
-		return cg_fd;
-	}
-
 	/* Attach programs to sockmap */
 	err = bpf_prog_attach(prog_fd[0], map_fd[0],
 				BPF_SK_SKB_STREAM_PARSER, 0);
@@ -280,12 +550,30 @@ int main(int argc, char **argv)
 		return err;
 	}
 
-	err = sockmap_test_sockets(rate, dot);
+run:
+	err = sockmap_init_sockets();
 	if (err) {
 		fprintf(stderr, "ERROR: test socket failed: %d\n", err);
-		return err;
+		goto out;
 	}
-	return 0;
+
+	if (test == PING_PONG)
+		err = forever_ping_pong(rate, verbose);
+	else if (test == SENDMSG)
+		err = sendmsg_test(iov_count, length, rate, verbose, false);
+	else if (test == BASE)
+		err = sendmsg_test(iov_count, length, rate, verbose, true);
+	else
+		fprintf(stderr, "unknown test\n");
+out:
+	close(s1);
+	close(s2);
+	close(p1);
+	close(p2);
+	close(c1);
+	close(c2);
+	close(cg_fd);
+	return err;
 }
 
 void running_handler(int a)