From a493f5f9d8c2e87f2d1322c01e477502b3cd11c5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 10:23:13 -0700
Subject: libbpf: Install btf.h with libbpf

install_headers target should contain all headers that are part of
libbpf. Add missing btf.h

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index f3fab4af4260..5390e7725e43 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -189,6 +189,7 @@ install_headers:
 	$(call QUIET_INSTALL, headers) \
 		$(call do_install,bpf.h,$(prefix)/include/bpf,644); \
 		$(call do_install,libbpf.h,$(prefix)/include/bpf,644);
+		$(call do_install,btf.h,$(prefix)/include/bpf,644);
 
 install: install_lib
 
-- 
cgit v1.2.3


From 53c8036cb715f3577a7fe1db6e6ad06e8697b36f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 25 May 2018 23:33:19 +0200
Subject: bpf: btf: avoid -Wreturn-type warning

gcc warns about a noreturn function possibly returning in
some configurations:

kernel/bpf/btf.c: In function 'env_type_is_resolve_sink':
kernel/bpf/btf.c:729:1: error: control reaches end of non-void function [-Werror=return-type]

Using BUG() instead of BUG_ON() avoids that warning and otherwise
does the exact same thing.

Fixes: eb3f595dab40 ("bpf: btf: Validate type reference")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7e90fd13b5b5..3d20aa1f4b54 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -749,7 +749,7 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
 			!btf_type_is_array(next_type) &&
 			!btf_type_is_struct(next_type);
 	default:
-		BUG_ON(1);
+		BUG();
 	}
 }
 
-- 
cgit v1.2.3


From dc3b8ae9d271897e09b27fa4e4e0000de98590d1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 25 May 2018 23:33:20 +0200
Subject: bpf: avoid -Wmaybe-uninitialized warning

The stack_map_get_build_id_offset() function is too long for gcc to track
whether 'work' may or may not be initialized at the end of it, leading
to a false-positive warning:

kernel/bpf/stackmap.c: In function 'stack_map_get_build_id_offset':
kernel/bpf/stackmap.c:334:13: error: 'work' may be used uninitialized in this function [-Werror=maybe-uninitialized]

This removes the 'in_nmi_ctx' flag and uses the state of that variable
itself to see if it got initialized.

Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/stackmap.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b59ace0f0f09..b675a3f3d141 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -285,11 +285,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 {
 	int i;
 	struct vm_area_struct *vma;
-	bool in_nmi_ctx = in_nmi();
 	bool irq_work_busy = false;
-	struct stack_map_irq_work *work;
+	struct stack_map_irq_work *work = NULL;
 
-	if (in_nmi_ctx) {
+	if (in_nmi()) {
 		work = this_cpu_ptr(&up_read_work);
 		if (work->irq_work.flags & IRQ_WORK_BUSY)
 			/* cannot queue more up_read, fallback */
@@ -328,7 +327,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
 
-	if (!in_nmi_ctx) {
+	if (!work) {
 		up_read(&current->mm->mmap_sem);
 	} else {
 		work->sem = &current->mm->mmap_sem;
-- 
cgit v1.2.3


From 3b296633ae6b91bfff815eb005d220b6ad4a47b1 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sat, 26 May 2018 15:44:08 +0100
Subject: selftests/bpf: missing headers test_lwt_seg6local

Previous patch "selftests/bpf: test for seg6local End.BPF action" lacks
some UAPI headers in tools/.

clang -I. -I./include/uapi -I../../../include/uapi -idirafter
/usr/local/include -idirafter
/data/users/yhs/work/llvm/build/install/lib/clang/7.0.0/include
-idirafter /usr/include -Wno-compare-distinct-pointer-types \
         -O2 -target bpf -emit-llvm -c test_lwt_seg6local.c -o - |      \
llc -march=bpf -mcpu=generic  -filetype=obj -o
[...]/net-next/tools/testing/selftests/bpf/test_lwt_seg6local.o
test_lwt_seg6local.c:4:10: fatal error: 'linux/seg6_local.h' file not found
         ^~~~~~~~~~~~~~~~~~~~
1 error generated.
make: Leaving directory
`/data/users/yhs/work/net-next/tools/testing/selftests/bpf'

v2: moving the headers to tools/include/uapi/.

Reported-by: Y Song <ys114321@gmail.com>
Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/seg6.h       | 55 ++++++++++++++++++++++++
 tools/include/uapi/linux/seg6_local.h | 80 +++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 tools/include/uapi/linux/seg6.h
 create mode 100644 tools/include/uapi/linux/seg6_local.h

diff --git a/tools/include/uapi/linux/seg6.h b/tools/include/uapi/linux/seg6.h
new file mode 100644
index 000000000000..286e8d6a8e98
--- /dev/null
+++ b/tools/include/uapi/linux/seg6.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_H
+#define _UAPI_LINUX_SEG6_H
+
+#include <linux/types.h>
+#include <linux/in6.h>		/* For struct in6_addr. */
+
+/*
+ * SRH
+ */
+struct ipv6_sr_hdr {
+	__u8	nexthdr;
+	__u8	hdrlen;
+	__u8	type;
+	__u8	segments_left;
+	__u8	first_segment; /* Represents the last_entry field of SRH */
+	__u8	flags;
+	__u16	tag;
+
+	struct in6_addr segments[0];
+};
+
+#define SR6_FLAG1_PROTECTED	(1 << 6)
+#define SR6_FLAG1_OAM		(1 << 5)
+#define SR6_FLAG1_ALERT		(1 << 4)
+#define SR6_FLAG1_HMAC		(1 << 3)
+
+#define SR6_TLV_INGRESS		1
+#define SR6_TLV_EGRESS		2
+#define SR6_TLV_OPAQUE		3
+#define SR6_TLV_PADDING		4
+#define SR6_TLV_HMAC		5
+
+#define sr_has_hmac(srh) ((srh)->flags & SR6_FLAG1_HMAC)
+
+struct sr6_tlv {
+	__u8 type;
+	__u8 len;
+	__u8 data[0];
+};
+
+#endif
diff --git a/tools/include/uapi/linux/seg6_local.h b/tools/include/uapi/linux/seg6_local.h
new file mode 100644
index 000000000000..edc138bdc56d
--- /dev/null
+++ b/tools/include/uapi/linux/seg6_local.h
@@ -0,0 +1,80 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_LOCAL_H
+#define _UAPI_LINUX_SEG6_LOCAL_H
+
+#include <linux/seg6.h>
+
+enum {
+	SEG6_LOCAL_UNSPEC,
+	SEG6_LOCAL_ACTION,
+	SEG6_LOCAL_SRH,
+	SEG6_LOCAL_TABLE,
+	SEG6_LOCAL_NH4,
+	SEG6_LOCAL_NH6,
+	SEG6_LOCAL_IIF,
+	SEG6_LOCAL_OIF,
+	SEG6_LOCAL_BPF,
+	__SEG6_LOCAL_MAX,
+};
+#define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
+
+enum {
+	SEG6_LOCAL_ACTION_UNSPEC	= 0,
+	/* node segment */
+	SEG6_LOCAL_ACTION_END		= 1,
+	/* adjacency segment (IPv6 cross-connect) */
+	SEG6_LOCAL_ACTION_END_X		= 2,
+	/* lookup of next seg NH in table */
+	SEG6_LOCAL_ACTION_END_T		= 3,
+	/* decap and L2 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX2	= 4,
+	/* decap and IPv6 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX6	= 5,
+	/* decap and IPv4 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX4	= 6,
+	/* decap and lookup of DA in v6 table */
+	SEG6_LOCAL_ACTION_END_DT6	= 7,
+	/* decap and lookup of DA in v4 table */
+	SEG6_LOCAL_ACTION_END_DT4	= 8,
+	/* binding segment with insertion */
+	SEG6_LOCAL_ACTION_END_B6	= 9,
+	/* binding segment with encapsulation */
+	SEG6_LOCAL_ACTION_END_B6_ENCAP	= 10,
+	/* binding segment with MPLS encap */
+	SEG6_LOCAL_ACTION_END_BM	= 11,
+	/* lookup last seg in table */
+	SEG6_LOCAL_ACTION_END_S		= 12,
+	/* forward to SR-unaware VNF with static proxy */
+	SEG6_LOCAL_ACTION_END_AS	= 13,
+	/* forward to SR-unaware VNF with masquerading */
+	SEG6_LOCAL_ACTION_END_AM	= 14,
+	/* custom BPF action */
+	SEG6_LOCAL_ACTION_END_BPF	= 15,
+
+	__SEG6_LOCAL_ACTION_MAX,
+};
+
+#define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1)
+
+enum {
+	SEG6_LOCAL_BPF_PROG_UNSPEC,
+	SEG6_LOCAL_BPF_PROG,
+	SEG6_LOCAL_BPF_PROG_NAME,
+	__SEG6_LOCAL_BPF_PROG_MAX,
+};
+
+#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1)
+
+#endif
-- 
cgit v1.2.3


From 13193b0f392f5a65d0d54185cb95ed5e99c0a5bf Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:22 -0700
Subject: bpf: Define cgroup_bpf_enabled for CONFIG_CGROUP_BPF=n

Static key is used to enable/disable cgroup-bpf related code paths at
run time.

Though it's not defined when cgroup-bpf is disabled at compile time,
i.e. CONFIG_CGROUP_BPF=n, and if some code wants to use it, it has to do
this:

	#ifdef CONFIG_CGROUP_BPF
		if (cgroup_bpf_enabled) {
			/* ... some work ... */
		}
	#endif

This code can be simplified by setting cgroup_bpf_enabled to 0 for
CONFIG_CGROUP_BPF=n case:

	if (cgroup_bpf_enabled) {
		/* ... some work ... */
	}

And it aligns well with existing BPF_CGROUP_RUN_PROG_* macros that
defined for both states of CONFIG_CGROUP_BPF.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 30d15e64b993..de8e89a3758b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -185,6 +185,7 @@ struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
+#define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
-- 
cgit v1.2.3


From 1cedee13d25ab118d325f95588c1a084e9317229 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:23 -0700
Subject: bpf: Hooks for sys_sendmsg

In addition to already existing BPF hooks for sys_bind and sys_connect,
the patch provides new hooks for sys_sendmsg.

It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR`
that provides access to socket itlself (properties like family, type,
protocol) and user-passed `struct sockaddr *` so that BPF program can
override destination IP and port for system calls such as sendto(2) or
sendmsg(2) and/or assign source IP to the socket.

The hooks are implemented as two new attach types:
`BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and
UDPv6 correspondingly.

UDPv4 and UDPv6 separate attach types for same reason as sys_bind and
sys_connect hooks, i.e. to prevent reading from / writing to e.g.
user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound.

The difference with already existing hooks is sys_sendmsg are
implemented only for unconnected UDP.

For TCP it doesn't make sense to change user-provided `struct sockaddr *`
at sendto(2)/sendmsg(2) time since socket either was already connected
and has source/destination set or wasn't connected and call to
sendto(2)/sendmsg(2) would lead to ENOTCONN anyway.

Connected UDP is already handled by sys_connect hooks that can override
source/destination at connect time and use fast-path later, i.e. these
hooks don't affect UDP fast-path.

Rewriting source IP is implemented differently than that in sys_connect
hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to
just bind socket to desired local IP address since source IP can be set
on per-packet basis by using ancillary data (cmsg(3)). So no matter if
socket is bound or not, source IP has to be rewritten on every call to
sys_sendmsg.

To do so two new fields are added to UAPI `struct bpf_sock_addr`;
* `msg_src_ip4` to set source IPv4 for UDPv4;
* `msg_src_ip6` to set source IPv6 for UDPv6.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 23 +++++++++++++++++------
 include/linux/filter.h     |  1 +
 include/uapi/linux/bpf.h   |  8 ++++++++
 kernel/bpf/cgroup.c        | 11 ++++++++++-
 kernel/bpf/syscall.c       |  8 ++++++++
 net/core/filter.c          | 39 +++++++++++++++++++++++++++++++++++++++
 net/ipv4/udp.c             | 20 ++++++++++++++++++--
 net/ipv6/udp.c             | 24 ++++++++++++++++++++++++
 8 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index de8e89a3758b..975fb4cf1bb7 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type);
+				      enum bpf_attach_type type,
+				      void *t_ctx);
 
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
@@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+							  NULL);	       \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)			       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)		       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)	{					       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+							  t_ctx);	       \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
 
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
+
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
+
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
@@ -198,6 +207,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d358d1815c16..d90abdaea94b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern {
 	 * only two (src and dst) are available at convert_ctx_access time
 	 */
 	u64 tmp_reg;
+	void *t_ctx;	/* Attach type specific context. */
 };
 
 struct bpf_sock_ops_kern {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9b8c6e310e9a..cc68787f2d97 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -160,6 +160,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_CONNECT,
 	BPF_CGROUP_INET4_POST_BIND,
 	BPF_CGROUP_INET6_POST_BIND,
+	BPF_CGROUP_UDP4_SENDMSG,
+	BPF_CGROUP_UDP6_SENDMSG,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2363,6 +2365,12 @@ struct bpf_sock_addr {
 	__u32 family;		/* Allows 4-byte read, but no write */
 	__u32 type;		/* Allows 4-byte read, but no write */
 	__u32 protocol;		/* Allows 4-byte read, but no write */
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
 };
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43171a0bb02b..f7c00bd6f8e4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
  * @type: The type of program to be exectuted
+ * @t_ctx: Pointer to attach type specific context
  *
  * socket is expected to be of type INET or INET6.
  *
@@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type)
+				      enum bpf_attach_type type,
+				      void *t_ctx)
 {
 	struct bpf_sock_addr_kern ctx = {
 		.sk = sk,
 		.uaddr = uaddr,
+		.t_ctx = t_ctx,
 	};
+	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
+	if (!ctx.uaddr) {
+		memset(&unspec, 0, sizeof(unspec));
+		ctx.uaddr = (struct sockaddr *)&unspec;
+	}
+
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 388d4feda348..e254526d6744 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1249,6 +1249,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UDP4_SENDMSG:
+		case BPF_CGROUP_UDP6_SENDMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -1565,6 +1567,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1635,6 +1639,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1692,6 +1698,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_POST_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
diff --git a/net/core/filter.c b/net/core/filter.c
index acf1f4fb99d1..24e6ce8be567 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5299,6 +5299,7 @@ static bool sock_addr_is_valid_access(int off, int size,
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET4_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
+		case BPF_CGROUP_UDP4_SENDMSG:
 			break;
 		default:
 			return false;
@@ -5308,6 +5309,24 @@ static bool sock_addr_is_valid_access(int off, int size,
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UDP6_SENDMSG:
+			break;
+		default:
+			return false;
+		}
+		break;
+	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_UDP4_SENDMSG:
+			break;
+		default:
+			return false;
+		}
+		break;
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_UDP6_SENDMSG:
 			break;
 		default:
 			return false;
@@ -5318,6 +5337,9 @@ static bool sock_addr_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
 	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
 		/* Only narrow read access allowed for now. */
 		if (type == BPF_READ) {
 			bpf_ctx_record_field_size(info, size_default);
@@ -6072,6 +6094,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
 					SK_FL_PROTO_SHIFT);
 		break;
+
+	case offsetof(struct bpf_sock_addr, msg_src_ip4):
+		/* Treat t_ctx as struct in_addr for msg_src_ip4. */
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+			s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+		break;
+
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
+		off = si->off;
+		off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+		/* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+			s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d71f1f3e1155..3c27d00b5730 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -901,6 +901,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct udp_sock *up = udp_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
 	struct flowi4 fl4_stack;
 	struct flowi4 *fl4;
 	int ulen = len;
@@ -955,8 +956,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	/*
 	 *	Get and verify the address.
 	 */
-	if (msg->msg_name) {
-		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+	if (usin) {
 		if (msg->msg_namelen < sizeof(*usin))
 			return -EINVAL;
 		if (usin->sin_family != AF_INET) {
@@ -1010,6 +1010,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		rcu_read_unlock();
 	}
 
+	if (cgroup_bpf_enabled && !connected) {
+		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+					    (struct sockaddr *)usin, &ipc.addr);
+		if (err)
+			goto out_free;
+		if (usin) {
+			if (usin->sin_port == 0) {
+				/* BPF program set invalid port. Reject it. */
+				err = -EINVAL;
+				goto out_free;
+			}
+			daddr = usin->sin_addr.s_addr;
+			dport = usin->sin_port;
+		}
+	}
+
 	saddr = ipc.addr;
 	ipc.addr = faddr = daddr;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 426c9d2b418d..9f729a7b8cf0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1316,6 +1316,29 @@ do_udp_sendmsg:
 		fl6.saddr = np->saddr;
 	fl6.fl6_sport = inet->inet_sport;
 
+	if (cgroup_bpf_enabled && !connected) {
+		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
+					   (struct sockaddr *)sin6, &fl6.saddr);
+		if (err)
+			goto out_no_dst;
+		if (sin6) {
+			if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+				/* BPF program rewrote IPv6-only by IPv4-mapped
+				 * IPv6. It's currently unsupported.
+				 */
+				err = -ENOTSUPP;
+				goto out_no_dst;
+			}
+			if (sin6->sin6_port == 0) {
+				/* BPF program set invalid port. Reject it. */
+				err = -EINVAL;
+				goto out_no_dst;
+			}
+			fl6.fl6_dport = sin6->sin6_port;
+			fl6.daddr = sin6->sin6_addr;
+		}
+	}
+
 	final_p = fl6_update_dst(&fl6, opt, &final);
 	if (final_p)
 		connected = false;
@@ -1395,6 +1418,7 @@ do_append_data:
 
 out:
 	dst_release(dst);
+out_no_dst:
 	fl6_sock_release(flowlabel);
 	txopt_put(opt_to_free);
 	if (!err)
-- 
cgit v1.2.3


From 3024cf8248c49d9c57f56f6b650e8693f6e1bb57 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:24 -0700
Subject: bpf: Sync bpf.h to tools/

Sync new `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG`
attach types to tools/.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9b8c6e310e9a..cc68787f2d97 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -160,6 +160,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_CONNECT,
 	BPF_CGROUP_INET4_POST_BIND,
 	BPF_CGROUP_INET6_POST_BIND,
+	BPF_CGROUP_UDP4_SENDMSG,
+	BPF_CGROUP_UDP6_SENDMSG,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2363,6 +2365,12 @@ struct bpf_sock_addr {
 	__u32 family;		/* Allows 4-byte read, but no write */
 	__u32 type;		/* Allows 4-byte read, but no write */
 	__u32 protocol;		/* Allows 4-byte read, but no write */
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
 };
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit v1.2.3


From 72481f398c4fc6ca32013310c3dec0ef3e6e5bf2 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:25 -0700
Subject: libbpf: Support guessing sendmsg{4,6} progs

libbpf can guess prog type and expected attach type based on section
name. Add hints for "cgroup/sendmsg4" and "cgroup/sendmsg6" section
names.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index d20411ebfa2f..b1a60ac8424e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2043,6 +2043,8 @@ static const struct {
 	BPF_SA_PROG_SEC("cgroup/bind6",	BPF_CGROUP_INET6_BIND),
 	BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
 	BPF_SA_PROG_SEC("cgroup/connect6", BPF_CGROUP_INET6_CONNECT),
+	BPF_SA_PROG_SEC("cgroup/sendmsg4", BPF_CGROUP_UDP4_SENDMSG),
+	BPF_SA_PROG_SEC("cgroup/sendmsg6", BPF_CGROUP_UDP6_SENDMSG),
 	BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND),
 	BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND),
 };
-- 
cgit v1.2.3


From 9be71aa6e56632195ef5b4ee20b87854262e6e85 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:26 -0700
Subject: selftests/bpf: Prepare test_sock_addr for extension

test_sock_addr was not easy to extend since it was focused on sys_bind
and sys_connect quite a bit.

Reorganized it so that it'll be easier to cover new test-cases for
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR`:

- decouple test-cases so that only one BPF prog is tested at a time;

- check programmatically that local IP:port for sys_bind, source IP and
  destination IP:port for sys_connect are rewritten property by tested
  BPF programs.

The output of new version:
  # test_sock_addr.sh 2>/dev/null
  Wait for testing IPv4/IPv6 to become available ... OK
  Test case: bind4: load prog with wrong expected attach type .. [PASS]
  Test case: bind4: attach prog with wrong attach type .. [PASS]
  Test case: bind4: rewrite IP & TCP port in .. [PASS]
  Test case: bind4: rewrite IP & UDP port in .. [PASS]
  Test case: bind6: load prog with wrong expected attach type .. [PASS]
  Test case: bind6: attach prog with wrong attach type .. [PASS]
  Test case: bind6: rewrite IP & TCP port in .. [PASS]
  Test case: bind6: rewrite IP & UDP port in .. [PASS]
  Test case: connect4: load prog with wrong expected attach type .. [PASS]
  Test case: connect4: attach prog with wrong attach type .. [PASS]
  Test case: connect4: rewrite IP & TCP port .. [PASS]
  Test case: connect4: rewrite IP & UDP port .. [PASS]
  Test case: connect6: load prog with wrong expected attach type .. [PASS]
  Test case: connect6: attach prog with wrong attach type .. [PASS]
  Test case: connect6: rewrite IP & TCP port .. [PASS]
  Test case: connect6: rewrite IP & UDP port .. [PASS]
  Summary: 16 PASSED, 0 FAILED

(stderr contains errors from libbpf when testing load/attach with
invalid arguments)

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sock_addr.c | 655 +++++++++++++++++++--------
 1 file changed, 460 insertions(+), 195 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 2950f80ba7fb..ed3e397a56f6 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -17,34 +17,292 @@
 #include "cgroup_helpers.h"
 #include "bpf_rlimit.h"
 
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
 #define CG_PATH	"/foo"
 #define CONNECT4_PROG_PATH	"./connect4_prog.o"
 #define CONNECT6_PROG_PATH	"./connect6_prog.o"
 
 #define SERV4_IP		"192.168.1.254"
 #define SERV4_REWRITE_IP	"127.0.0.1"
+#define SRC4_REWRITE_IP		"127.0.0.4"
 #define SERV4_PORT		4040
 #define SERV4_REWRITE_PORT	4444
 
 #define SERV6_IP		"face:b00c:1234:5678::abcd"
 #define SERV6_REWRITE_IP	"::1"
+#define SRC6_REWRITE_IP		"::6"
 #define SERV6_PORT		6060
 #define SERV6_REWRITE_PORT	6666
 
 #define INET_NTOP_BUF	40
 
-typedef int (*load_fn)(enum bpf_attach_type, const char *comment);
+struct sock_addr_test;
+
+typedef int (*load_fn)(const struct sock_addr_test *test);
 typedef int (*info_fn)(int, struct sockaddr *, socklen_t *);
 
-struct program {
-	enum bpf_attach_type type;
-	load_fn	loadfn;
-	int fd;
-	const char *name;
-	enum bpf_attach_type invalid_type;
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+struct sock_addr_test {
+	const char *descr;
+	/* BPF prog properties */
+	load_fn loadfn;
+	enum bpf_attach_type expected_attach_type;
+	enum bpf_attach_type attach_type;
+	/* Socket properties */
+	int domain;
+	int type;
+	/* IP:port pairs for BPF prog to override */
+	const char *requested_ip;
+	unsigned short requested_port;
+	const char *expected_ip;
+	unsigned short expected_port;
+	const char *expected_src_ip;
+	/* Expected test result */
+	enum {
+		LOAD_REJECT,
+		ATTACH_REJECT,
+		SUCCESS,
+	} expected_result;
 };
 
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
+static int bind4_prog_load(const struct sock_addr_test *test);
+static int bind6_prog_load(const struct sock_addr_test *test);
+static int connect4_prog_load(const struct sock_addr_test *test);
+static int connect6_prog_load(const struct sock_addr_test *test);
+
+static struct sock_addr_test tests[] = {
+	/* bind */
+	{
+		"bind4: load prog with wrong expected attach type",
+		bind4_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"bind4: attach prog with wrong attach type",
+		bind4_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"bind4: rewrite IP & TCP port in",
+		bind4_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		"bind4: rewrite IP & UDP port in",
+		bind4_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		"bind6: load prog with wrong expected attach type",
+		bind6_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET6,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"bind6: attach prog with wrong attach type",
+		bind6_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"bind6: rewrite IP & TCP port in",
+		bind6_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		"bind6: rewrite IP & UDP port in",
+		bind6_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+
+	/* connect */
+	{
+		"connect4: load prog with wrong expected attach type",
+		connect4_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"connect4: attach prog with wrong attach type",
+		connect4_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"connect4: rewrite IP & TCP port",
+		connect4_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"connect4: rewrite IP & UDP port",
+		connect4_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"connect6: load prog with wrong expected attach type",
+		connect6_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET6,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"connect6: attach prog with wrong attach type",
+		connect6_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"connect6: rewrite IP & TCP port",
+		connect6_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"connect6: rewrite IP & UDP port",
+		connect6_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+};
 
 static int mk_sockaddr(int domain, const char *ip, unsigned short port,
 		       struct sockaddr *addr, socklen_t addr_len)
@@ -84,25 +342,23 @@ static int mk_sockaddr(int domain, const char *ip, unsigned short port,
 	return 0;
 }
 
-static int load_insns(enum bpf_attach_type attach_type,
-		      const struct bpf_insn *insns, size_t insns_cnt,
-		      const char *comment)
+static int load_insns(const struct sock_addr_test *test,
+		      const struct bpf_insn *insns, size_t insns_cnt)
 {
 	struct bpf_load_program_attr load_attr;
 	int ret;
 
 	memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
 	load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
-	load_attr.expected_attach_type = attach_type;
+	load_attr.expected_attach_type = test->expected_attach_type;
 	load_attr.insns = insns;
 	load_attr.insns_cnt = insns_cnt;
 	load_attr.license = "GPL";
 
 	ret = bpf_load_program_xattr(&load_attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
-	if (ret < 0 && comment) {
-		log_err(">>> Loading %s program error.\n"
-			">>> Output from verifier:\n%s\n-------\n",
-			comment, bpf_log_buf);
+	if (ret < 0 && test->expected_result != LOAD_REJECT) {
+		log_err(">>> Loading program error.\n"
+			">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
 	}
 
 	return ret;
@@ -119,8 +375,7 @@ static int load_insns(enum bpf_attach_type attach_type,
  * to count jumps properly.
  */
 
-static int bind4_prog_load(enum bpf_attach_type attach_type,
-			   const char *comment)
+static int bind4_prog_load(const struct sock_addr_test *test)
 {
 	union {
 		uint8_t u4_addr8[4];
@@ -186,12 +441,10 @@ static int bind4_prog_load(enum bpf_attach_type attach_type,
 		BPF_EXIT_INSN(),
 	};
 
-	return load_insns(attach_type, insns,
-			  sizeof(insns) / sizeof(struct bpf_insn), comment);
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
 }
 
-static int bind6_prog_load(enum bpf_attach_type attach_type,
-			   const char *comment)
+static int bind6_prog_load(const struct sock_addr_test *test)
 {
 	struct sockaddr_in6 addr6_rw;
 	struct in6_addr ip6;
@@ -254,13 +507,10 @@ static int bind6_prog_load(enum bpf_attach_type attach_type,
 		BPF_EXIT_INSN(),
 	};
 
-	return load_insns(attach_type, insns,
-			  sizeof(insns) / sizeof(struct bpf_insn), comment);
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
 }
 
-static int connect_prog_load_path(const char *path,
-				  enum bpf_attach_type attach_type,
-				  const char *comment)
+static int load_path(const struct sock_addr_test *test, const char *path)
 {
 	struct bpf_prog_load_attr attr;
 	struct bpf_object *obj;
@@ -269,75 +519,83 @@ static int connect_prog_load_path(const char *path,
 	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
 	attr.file = path;
 	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
-	attr.expected_attach_type = attach_type;
+	attr.expected_attach_type = test->expected_attach_type;
 
 	if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) {
-		if (comment)
-			log_err(">>> Loading %s program at %s error.\n",
-				comment, path);
+		if (test->expected_result != LOAD_REJECT)
+			log_err(">>> Loading program (%s) error.\n", path);
 		return -1;
 	}
 
 	return prog_fd;
 }
 
-static int connect4_prog_load(enum bpf_attach_type attach_type,
-			      const char *comment)
+static int connect4_prog_load(const struct sock_addr_test *test)
 {
-	return connect_prog_load_path(CONNECT4_PROG_PATH, attach_type, comment);
+	return load_path(test, CONNECT4_PROG_PATH);
 }
 
-static int connect6_prog_load(enum bpf_attach_type attach_type,
-			      const char *comment)
+static int connect6_prog_load(const struct sock_addr_test *test)
 {
-	return connect_prog_load_path(CONNECT6_PROG_PATH, attach_type, comment);
+	return load_path(test, CONNECT6_PROG_PATH);
 }
 
-static void print_ip_port(int sockfd, info_fn fn, const char *fmt)
+static int cmp_addr(const struct sockaddr_storage *addr1,
+		    const struct sockaddr_storage *addr2, int cmp_port)
 {
-	char addr_buf[INET_NTOP_BUF];
-	struct sockaddr_storage addr;
-	struct sockaddr_in6 *addr6;
-	struct sockaddr_in *addr4;
-	socklen_t addr_len;
-	unsigned short port;
-	void *nip;
-
-	addr_len = sizeof(struct sockaddr_storage);
-	memset(&addr, 0, addr_len);
-
-	if (fn(sockfd, (struct sockaddr *)&addr, (socklen_t *)&addr_len) == 0) {
-		if (addr.ss_family == AF_INET) {
-			addr4 = (struct sockaddr_in *)&addr;
-			nip = (void *)&addr4->sin_addr;
-			port = ntohs(addr4->sin_port);
-		} else if (addr.ss_family == AF_INET6) {
-			addr6 = (struct sockaddr_in6 *)&addr;
-			nip = (void *)&addr6->sin6_addr;
-			port = ntohs(addr6->sin6_port);
-		} else {
-			return;
-		}
-		const char *addr_str =
-			inet_ntop(addr.ss_family, nip, addr_buf, INET_NTOP_BUF);
-		printf(fmt, addr_str ? addr_str : "??", port);
+	const struct sockaddr_in *four1, *four2;
+	const struct sockaddr_in6 *six1, *six2;
+
+	if (addr1->ss_family != addr2->ss_family)
+		return -1;
+
+	if (addr1->ss_family == AF_INET) {
+		four1 = (const struct sockaddr_in *)addr1;
+		four2 = (const struct sockaddr_in *)addr2;
+		return !((four1->sin_port == four2->sin_port || !cmp_port) &&
+			 four1->sin_addr.s_addr == four2->sin_addr.s_addr);
+	} else if (addr1->ss_family == AF_INET6) {
+		six1 = (const struct sockaddr_in6 *)addr1;
+		six2 = (const struct sockaddr_in6 *)addr2;
+		return !((six1->sin6_port == six2->sin6_port || !cmp_port) &&
+			 !memcmp(&six1->sin6_addr, &six2->sin6_addr,
+				 sizeof(struct in6_addr)));
 	}
+
+	return -1;
+}
+
+static int cmp_sock_addr(info_fn fn, int sock1,
+			 const struct sockaddr_storage *addr2, int cmp_port)
+{
+	struct sockaddr_storage addr1;
+	socklen_t len1 = sizeof(addr1);
+
+	memset(&addr1, 0, len1);
+	if (fn(sock1, (struct sockaddr *)&addr1, (socklen_t *)&len1) != 0)
+		return -1;
+
+	return cmp_addr(&addr1, addr2, cmp_port);
+}
+
+static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2)
+{
+	return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 0);
 }
 
-static void print_local_ip_port(int sockfd, const char *fmt)
+static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2)
 {
-	print_ip_port(sockfd, getsockname, fmt);
+	return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 1);
 }
 
-static void print_remote_ip_port(int sockfd, const char *fmt)
+static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2)
 {
-	print_ip_port(sockfd, getpeername, fmt);
+	return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1);
 }
 
 static int start_server(int type, const struct sockaddr_storage *addr,
 			socklen_t addr_len)
 {
-
 	int fd;
 
 	fd = socket(addr->ss_family, type, 0);
@@ -358,8 +616,6 @@ static int start_server(int type, const struct sockaddr_storage *addr,
 		}
 	}
 
-	print_local_ip_port(fd, "\t   Actual: bind(%s, %d)\n");
-
 	goto out;
 close_out:
 	close(fd);
@@ -372,19 +628,19 @@ static int connect_to_server(int type, const struct sockaddr_storage *addr,
 			     socklen_t addr_len)
 {
 	int domain;
-	int fd;
+	int fd = -1;
 
 	domain = addr->ss_family;
 
 	if (domain != AF_INET && domain != AF_INET6) {
 		log_err("Unsupported address family");
-		return -1;
+		goto err;
 	}
 
 	fd = socket(domain, type, 0);
 	if (fd == -1) {
-		log_err("Failed to creating client socket");
-		return -1;
+		log_err("Failed to create client socket");
+		goto err;
 	}
 
 	if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) {
@@ -392,162 +648,188 @@ static int connect_to_server(int type, const struct sockaddr_storage *addr,
 		goto err;
 	}
 
-	print_remote_ip_port(fd, "\t   Actual: connect(%s, %d)");
-	print_local_ip_port(fd, " from (%s, %d)\n");
-
-	return 0;
+	goto out;
 err:
 	close(fd);
-	return -1;
+	fd = -1;
+out:
+	return fd;
 }
 
-static void print_test_case_num(int domain, int type)
+static int init_addrs(const struct sock_addr_test *test,
+		      struct sockaddr_storage *requested_addr,
+		      struct sockaddr_storage *expected_addr,
+		      struct sockaddr_storage *expected_src_addr)
 {
-	static int test_num;
-
-	printf("Test case #%d (%s/%s):\n", ++test_num,
-	       (domain == AF_INET ? "IPv4" :
-		domain == AF_INET6 ? "IPv6" :
-		"unknown_domain"),
-	       (type == SOCK_STREAM ? "TCP" :
-		type == SOCK_DGRAM ? "UDP" :
-		"unknown_type"));
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+
+	if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port,
+			(struct sockaddr *)expected_addr, addr_len) == -1)
+		goto err;
+
+	if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port,
+			(struct sockaddr *)requested_addr, addr_len) == -1)
+		goto err;
+
+	if (test->expected_src_ip &&
+	    mk_sockaddr(test->domain, test->expected_src_ip, 0,
+			(struct sockaddr *)expected_src_addr, addr_len) == -1)
+		goto err;
+
+	return 0;
+err:
+	return -1;
 }
 
-static int run_test_case(int domain, int type, const char *ip,
-			 unsigned short port)
+static int run_bind_test_case(const struct sock_addr_test *test)
 {
-	struct sockaddr_storage addr;
-	socklen_t addr_len = sizeof(addr);
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	int clientfd = -1;
 	int servfd = -1;
 	int err = 0;
 
-	print_test_case_num(domain, type);
-
-	if (mk_sockaddr(domain, ip, port, (struct sockaddr *)&addr,
-			addr_len) == -1)
-		return -1;
+	if (init_addrs(test, &requested_addr, &expected_addr, NULL))
+		goto err;
 
-	printf("\tRequested: bind(%s, %d) ..\n", ip, port);
-	servfd = start_server(type, &addr, addr_len);
+	servfd = start_server(test->type, &requested_addr, addr_len);
 	if (servfd == -1)
 		goto err;
 
-	printf("\tRequested: connect(%s, %d) from (*, *) ..\n", ip, port);
-	if (connect_to_server(type, &addr, addr_len))
+	if (cmp_local_addr(servfd, &expected_addr))
+		goto err;
+
+	/* Try to connect to server just in case */
+	clientfd = connect_to_server(test->type, &expected_addr, addr_len);
+	if (clientfd == -1)
 		goto err;
 
 	goto out;
 err:
 	err = -1;
 out:
+	close(clientfd);
 	close(servfd);
 	return err;
 }
 
-static void close_progs_fds(struct program *progs, size_t prog_cnt)
+static int run_connect_test_case(const struct sock_addr_test *test)
 {
-	size_t i;
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage expected_src_addr;
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	int clientfd = -1;
+	int servfd = -1;
+	int err = 0;
 
-	for (i = 0; i < prog_cnt; ++i) {
-		close(progs[i].fd);
-		progs[i].fd = -1;
-	}
-}
+	if (init_addrs(test, &requested_addr, &expected_addr,
+		       &expected_src_addr))
+		goto err;
 
-static int load_and_attach_progs(int cgfd, struct program *progs,
-				 size_t prog_cnt)
-{
-	size_t i;
-
-	for (i = 0; i < prog_cnt; ++i) {
-		printf("Load %s with invalid type (can pollute stderr) ",
-		       progs[i].name);
-		fflush(stdout);
-		progs[i].fd = progs[i].loadfn(progs[i].invalid_type, NULL);
-		if (progs[i].fd != -1) {
-			log_err("Load with invalid type accepted for %s",
-				progs[i].name);
-			goto err;
-		}
-		printf("... REJECTED\n");
+	/* Prepare server to connect to */
+	servfd = start_server(test->type, &expected_addr, addr_len);
+	if (servfd == -1)
+		goto err;
 
-		printf("Load %s with valid type", progs[i].name);
-		progs[i].fd = progs[i].loadfn(progs[i].type, progs[i].name);
-		if (progs[i].fd == -1) {
-			log_err("Failed to load program %s", progs[i].name);
-			goto err;
-		}
-		printf(" ... OK\n");
-
-		printf("Attach %s with invalid type", progs[i].name);
-		if (bpf_prog_attach(progs[i].fd, cgfd, progs[i].invalid_type,
-				    BPF_F_ALLOW_OVERRIDE) != -1) {
-			log_err("Attach with invalid type accepted for %s",
-				progs[i].name);
-			goto err;
-		}
-		printf(" ... REJECTED\n");
+	clientfd = connect_to_server(test->type, &requested_addr, addr_len);
+	if (clientfd == -1)
+		goto err;
 
-		printf("Attach %s with valid type", progs[i].name);
-		if (bpf_prog_attach(progs[i].fd, cgfd, progs[i].type,
-				    BPF_F_ALLOW_OVERRIDE) == -1) {
-			log_err("Failed to attach program %s", progs[i].name);
-			goto err;
-		}
-		printf(" ... OK\n");
-	}
+	/* Make sure src and dst addrs were overridden properly */
+	if (cmp_peer_addr(clientfd, &expected_addr))
+		goto err;
 
-	return 0;
+	if (cmp_local_ip(clientfd, &expected_src_addr))
+		goto err;
+
+	goto out;
 err:
-	close_progs_fds(progs, prog_cnt);
-	return -1;
+	err = -1;
+out:
+	close(clientfd);
+	close(servfd);
+	return err;
 }
 
-static int run_domain_test(int domain, int cgfd, struct program *progs,
-			   size_t prog_cnt, const char *ip, unsigned short port)
+static int run_test_case(int cgfd, const struct sock_addr_test *test)
 {
+	int progfd = -1;
 	int err = 0;
 
-	if (load_and_attach_progs(cgfd, progs, prog_cnt) == -1)
+	printf("Test case: %s .. ", test->descr);
+
+	progfd = test->loadfn(test);
+	if (test->expected_result == LOAD_REJECT && progfd < 0)
+		goto out;
+	else if (test->expected_result == LOAD_REJECT || progfd < 0)
+		goto err;
+
+	err = bpf_prog_attach(progfd, cgfd, test->attach_type,
+			      BPF_F_ALLOW_OVERRIDE);
+	if (test->expected_result == ATTACH_REJECT && err) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	} else if (test->expected_result == ATTACH_REJECT || err) {
 		goto err;
+	}
 
-	if (run_test_case(domain, SOCK_STREAM, ip, port) == -1)
+	switch (test->attach_type) {
+	case BPF_CGROUP_INET4_BIND:
+	case BPF_CGROUP_INET6_BIND:
+		err = run_bind_test_case(test);
+		break;
+	case BPF_CGROUP_INET4_CONNECT:
+	case BPF_CGROUP_INET6_CONNECT:
+		err = run_connect_test_case(test);
+		break;
+	default:
 		goto err;
+	}
 
-	if (run_test_case(domain, SOCK_DGRAM, ip, port) == -1)
+	if (err || test->expected_result != SUCCESS)
 		goto err;
 
 	goto out;
 err:
 	err = -1;
 out:
-	close_progs_fds(progs, prog_cnt);
+	/* Detaching w/o checking return code: best effort attempt. */
+	if (progfd != -1)
+		bpf_prog_detach(cgfd, test->attach_type);
+	close(progfd);
+	printf("[%s]\n", err ? "FAIL" : "PASS");
 	return err;
 }
 
-static int run_test(void)
+static int run_tests(int cgfd)
+{
+	int passes = 0;
+	int fails = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+		if (run_test_case(cgfd, &tests[i]))
+			++fails;
+		else
+			++passes;
+	}
+	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
+	return fails ? -1 : 0;
+}
+
+int main(int argc, char **argv)
 {
-	size_t inet6_prog_cnt;
-	size_t inet_prog_cnt;
 	int cgfd = -1;
 	int err = 0;
 
-	struct program inet6_progs[] = {
-		{BPF_CGROUP_INET6_BIND, bind6_prog_load, -1, "bind6",
-		 BPF_CGROUP_INET4_BIND},
-		{BPF_CGROUP_INET6_CONNECT, connect6_prog_load, -1, "connect6",
-		 BPF_CGROUP_INET4_CONNECT},
-	};
-	inet6_prog_cnt = sizeof(inet6_progs) / sizeof(struct program);
-
-	struct program inet_progs[] = {
-		{BPF_CGROUP_INET4_BIND, bind4_prog_load, -1, "bind4",
-		 BPF_CGROUP_INET6_BIND},
-		{BPF_CGROUP_INET4_CONNECT, connect4_prog_load, -1, "connect4",
-		 BPF_CGROUP_INET6_CONNECT},
-	};
-	inet_prog_cnt = sizeof(inet_progs) / sizeof(struct program);
+	if (argc < 2) {
+		fprintf(stderr,
+			"%s has to be run via %s.sh. Skip direct run.\n",
+			argv[0], argv[0]);
+		exit(err);
+	}
 
 	if (setup_cgroup_environment())
 		goto err;
@@ -559,12 +841,7 @@ static int run_test(void)
 	if (join_cgroup(CG_PATH))
 		goto err;
 
-	if (run_domain_test(AF_INET, cgfd, inet_progs, inet_prog_cnt, SERV4_IP,
-			    SERV4_PORT) == -1)
-		goto err;
-
-	if (run_domain_test(AF_INET6, cgfd, inet6_progs, inet6_prog_cnt,
-			    SERV6_IP, SERV6_PORT) == -1)
+	if (run_tests(cgfd))
 		goto err;
 
 	goto out;
@@ -573,17 +850,5 @@ err:
 out:
 	close(cgfd);
 	cleanup_cgroup_environment();
-	printf(err ? "### FAIL\n" : "### SUCCESS\n");
 	return err;
 }
-
-int main(int argc, char **argv)
-{
-	if (argc < 2) {
-		fprintf(stderr,
-			"%s has to be run via %s.sh. Skip direct run.\n",
-			argv[0], argv[0]);
-		exit(0);
-	}
-	return run_test();
-}
-- 
cgit v1.2.3


From 04b6ab731209eac1e130fa00281a29278eca2f57 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:27 -0700
Subject: selftests/bpf: Selftest for sys_sendmsg hooks

Add selftest for BPF_CGROUP_UDP4_SENDMSG and BPF_CGROUP_UDP6_SENDMSG
attach types.

Try to sendmsg(2) to specific IP:port and test that:
* source IP is overridden as expected.
* remote IP:port pair is overridden as expected;

Both UDPv4 and UDPv6 are tested.

Output:
  # test_sock_addr.sh 2>/dev/null
  Wait for testing IPv4/IPv6 to become available ... OK
  ... pre-existing test-cases skipped ...
  Test case: sendmsg4: load prog with wrong expected attach type .. [PASS]
  Test case: sendmsg4: attach prog with wrong attach type .. [PASS]
  Test case: sendmsg4: rewrite IP & port (asm) .. [PASS]
  Test case: sendmsg4: rewrite IP & port (C) .. [PASS]
  Test case: sendmsg4: deny call .. [PASS]
  Test case: sendmsg6: load prog with wrong expected attach type .. [PASS]
  Test case: sendmsg6: attach prog with wrong attach type .. [PASS]
  Test case: sendmsg6: rewrite IP & port (asm) .. [PASS]
  Test case: sendmsg6: rewrite IP & port (C) .. [PASS]
  Test case: sendmsg6: IPv4-mapped IPv6 .. [PASS]
  Test case: sendmsg6: deny call .. [PASS]
  Summary: 27 PASSED, 0 FAILED

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile         |   2 +-
 tools/testing/selftests/bpf/sendmsg4_prog.c  |  49 +++
 tools/testing/selftests/bpf/sendmsg6_prog.c  |  60 ++++
 tools/testing/selftests/bpf/test_sock_addr.c | 518 +++++++++++++++++++++++++++
 4 files changed, 628 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/sendmsg4_prog.c
 create mode 100644 tools/testing/selftests/bpf/sendmsg6_prog.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 85044448bbc7..a1b66da965d0 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -34,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
-	test_lwt_seg6local.o
+	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/sendmsg4_prog.c b/tools/testing/selftests/bpf/sendmsg4_prog.c
new file mode 100644
index 000000000000..a91536b1c47e
--- /dev/null
+++ b/tools/testing/selftests/bpf/sendmsg4_prog.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define SRC1_IP4		0xAC100001U /* 172.16.0.1 */
+#define SRC2_IP4		0x00000000U
+#define SRC_REWRITE_IP4		0x7f000004U
+#define DST_IP4			0xC0A801FEU /* 192.168.1.254 */
+#define DST_REWRITE_IP4		0x7f000001U
+#define DST_PORT		4040
+#define DST_REWRITE_PORT4	4444
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg4")
+int sendmsg_v4_prog(struct bpf_sock_addr *ctx)
+{
+	if (ctx->type != SOCK_DGRAM)
+		return 0;
+
+	/* Rewrite source. */
+	if (ctx->msg_src_ip4 == bpf_htonl(SRC1_IP4) ||
+	    ctx->msg_src_ip4 == bpf_htonl(SRC2_IP4)) {
+		ctx->msg_src_ip4 = bpf_htonl(SRC_REWRITE_IP4);
+	} else {
+		/* Unexpected source. Reject sendmsg. */
+		return 0;
+	}
+
+	/* Rewrite destination. */
+	if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) &&
+	     ctx->user_port == bpf_htons(DST_PORT)) {
+		ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4);
+		ctx->user_port = bpf_htons(DST_REWRITE_PORT4);
+	} else {
+		/* Unexpected source. Reject sendmsg. */
+		return 0;
+	}
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/sendmsg6_prog.c b/tools/testing/selftests/bpf/sendmsg6_prog.c
new file mode 100644
index 000000000000..5aeaa284fc47
--- /dev/null
+++ b/tools/testing/selftests/bpf/sendmsg6_prog.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define SRC_REWRITE_IP6_0	0
+#define SRC_REWRITE_IP6_1	0
+#define SRC_REWRITE_IP6_2	0
+#define SRC_REWRITE_IP6_3	6
+
+#define DST_REWRITE_IP6_0	0
+#define DST_REWRITE_IP6_1	0
+#define DST_REWRITE_IP6_2	0
+#define DST_REWRITE_IP6_3	1
+
+#define DST_REWRITE_PORT6	6666
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_prog(struct bpf_sock_addr *ctx)
+{
+	if (ctx->type != SOCK_DGRAM)
+		return 0;
+
+	/* Rewrite source. */
+	if (ctx->msg_src_ip6[3] == bpf_htonl(1) ||
+	    ctx->msg_src_ip6[3] == bpf_htonl(0)) {
+		ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0);
+		ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1);
+		ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2);
+		ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3);
+	} else {
+		/* Unexpected source. Reject sendmsg. */
+		return 0;
+	}
+
+	/* Rewrite destination. */
+	if ((ctx->user_ip6[0] & 0xFFFF) == bpf_htons(0xFACE) &&
+	     ctx->user_ip6[0] >> 16 == bpf_htons(0xB00C)) {
+		ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_0);
+		ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_1);
+		ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2);
+		ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_3);
+
+		ctx->user_port = bpf_htons(DST_REWRITE_PORT6);
+	} else {
+		/* Unexpected destination. Reject sendmsg. */
+		return 0;
+	}
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index ed3e397a56f6..a5e76b9219b9 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -1,12 +1,16 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2018 Facebook
 
+#define _GNU_SOURCE
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 
 #include <arpa/inet.h>
+#include <netinet/in.h>
 #include <sys/types.h>
+#include <sys/select.h>
 #include <sys/socket.h>
 
 #include <linux/filter.h>
@@ -17,6 +21,10 @@
 #include "cgroup_helpers.h"
 #include "bpf_rlimit.h"
 
+#ifndef ENOTSUPP
+# define ENOTSUPP 524
+#endif
+
 #ifndef ARRAY_SIZE
 # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
@@ -24,15 +32,20 @@
 #define CG_PATH	"/foo"
 #define CONNECT4_PROG_PATH	"./connect4_prog.o"
 #define CONNECT6_PROG_PATH	"./connect6_prog.o"
+#define SENDMSG4_PROG_PATH	"./sendmsg4_prog.o"
+#define SENDMSG6_PROG_PATH	"./sendmsg6_prog.o"
 
 #define SERV4_IP		"192.168.1.254"
 #define SERV4_REWRITE_IP	"127.0.0.1"
+#define SRC4_IP			"172.16.0.1"
 #define SRC4_REWRITE_IP		"127.0.0.4"
 #define SERV4_PORT		4040
 #define SERV4_REWRITE_PORT	4444
 
 #define SERV6_IP		"face:b00c:1234:5678::abcd"
 #define SERV6_REWRITE_IP	"::1"
+#define SERV6_V4MAPPED_IP	"::ffff:192.168.0.4"
+#define SRC6_IP			"::1"
 #define SRC6_REWRITE_IP		"::6"
 #define SERV6_PORT		6060
 #define SERV6_REWRITE_PORT	6666
@@ -65,6 +78,8 @@ struct sock_addr_test {
 	enum {
 		LOAD_REJECT,
 		ATTACH_REJECT,
+		SYSCALL_EPERM,
+		SYSCALL_ENOTSUPP,
 		SUCCESS,
 	} expected_result;
 };
@@ -73,6 +88,12 @@ static int bind4_prog_load(const struct sock_addr_test *test);
 static int bind6_prog_load(const struct sock_addr_test *test);
 static int connect4_prog_load(const struct sock_addr_test *test);
 static int connect6_prog_load(const struct sock_addr_test *test);
+static int sendmsg_deny_prog_load(const struct sock_addr_test *test);
+static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test);
+static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test);
 
 static struct sock_addr_test tests[] = {
 	/* bind */
@@ -302,6 +323,162 @@ static struct sock_addr_test tests[] = {
 		SRC6_REWRITE_IP,
 		SUCCESS,
 	},
+
+	/* sendmsg */
+	{
+		"sendmsg4: load prog with wrong expected attach type",
+		sendmsg4_rw_asm_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"sendmsg4: attach prog with wrong attach type",
+		sendmsg4_rw_asm_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"sendmsg4: rewrite IP & port (asm)",
+		sendmsg4_rw_asm_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg4: rewrite IP & port (C)",
+		sendmsg4_rw_c_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg4: deny call",
+		sendmsg_deny_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		"sendmsg6: load prog with wrong expected attach type",
+		sendmsg6_rw_asm_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"sendmsg6: attach prog with wrong attach type",
+		sendmsg6_rw_asm_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"sendmsg6: rewrite IP & port (asm)",
+		sendmsg6_rw_asm_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg6: rewrite IP & port (C)",
+		sendmsg6_rw_c_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg6: IPv4-mapped IPv6",
+		sendmsg6_rw_v4mapped_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_ENOTSUPP,
+	},
+	{
+		"sendmsg6: deny call",
+		sendmsg_deny_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
 };
 
 static int mk_sockaddr(int domain, const char *ip, unsigned short port,
@@ -540,6 +717,141 @@ static int connect6_prog_load(const struct sock_addr_test *test)
 	return load_path(test, CONNECT6_PROG_PATH);
 }
 
+static int sendmsg_deny_prog_load(const struct sock_addr_test *test)
+{
+	struct bpf_insn insns[] = {
+		/* return 0 */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test)
+{
+	struct sockaddr_in dst4_rw_addr;
+	struct in_addr src4_rw_ip;
+
+	if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) {
+		log_err("Invalid IPv4: %s", SRC4_REWRITE_IP);
+		return -1;
+	}
+
+	if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT,
+			(struct sockaddr *)&dst4_rw_addr,
+			sizeof(dst4_rw_addr)) == -1)
+		return -1;
+
+	struct bpf_insn insns[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+		/* if (sk.family == AF_INET && */
+		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+			    offsetof(struct bpf_sock_addr, family)),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8),
+
+		/*     sk.type == SOCK_DGRAM)  { */
+		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+			    offsetof(struct bpf_sock_addr, type)),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6),
+
+		/*      msg_src_ip4 = src4_rw_ip */
+		BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, msg_src_ip4)),
+
+		/*      user_ip4 = dst4_rw_addr.sin_addr */
+		BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, user_ip4)),
+
+		/*      user_port = dst4_rw_addr.sin_port */
+		BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, user_port)),
+		/* } */
+
+		/* return 1 */
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test)
+{
+	return load_path(test, SENDMSG4_PROG_PATH);
+}
+
+static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test,
+					 const char *rw_dst_ip)
+{
+	struct sockaddr_in6 dst6_rw_addr;
+	struct in6_addr src6_rw_ip;
+
+	if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) {
+		log_err("Invalid IPv6: %s", SRC6_REWRITE_IP);
+		return -1;
+	}
+
+	if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT,
+			(struct sockaddr *)&dst6_rw_addr,
+			sizeof(dst6_rw_addr)) == -1)
+		return -1;
+
+	struct bpf_insn insns[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+		/* if (sk.family == AF_INET6) { */
+		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+			    offsetof(struct bpf_sock_addr, family)),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18),
+
+#define STORE_IPV6_WORD_N(DST, SRC, N)					       \
+		BPF_MOV32_IMM(BPF_REG_7, SRC[N]),			       \
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,		       \
+			    offsetof(struct bpf_sock_addr, DST[N]))
+
+#define STORE_IPV6(DST, SRC)						       \
+		STORE_IPV6_WORD_N(DST, SRC, 0),				       \
+		STORE_IPV6_WORD_N(DST, SRC, 1),				       \
+		STORE_IPV6_WORD_N(DST, SRC, 2),				       \
+		STORE_IPV6_WORD_N(DST, SRC, 3)
+
+		STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32),
+		STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32),
+
+		/*      user_port = dst6_rw_addr.sin6_port */
+		BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, user_port)),
+
+		/* } */
+
+		/* return 1 */
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test)
+{
+	return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP);
+}
+
+static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test)
+{
+	return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP);
+}
+
+static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test)
+{
+	return load_path(test, SENDMSG6_PROG_PATH);
+}
+
 static int cmp_addr(const struct sockaddr_storage *addr1,
 		    const struct sockaddr_storage *addr2, int cmp_port)
 {
@@ -656,6 +968,135 @@ out:
 	return fd;
 }
 
+int init_pktinfo(int domain, struct cmsghdr *cmsg)
+{
+	struct in6_pktinfo *pktinfo6;
+	struct in_pktinfo *pktinfo4;
+
+	if (domain == AF_INET) {
+		cmsg->cmsg_level = SOL_IP;
+		cmsg->cmsg_type = IP_PKTINFO;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
+		pktinfo4 = (struct in_pktinfo *)CMSG_DATA(cmsg);
+		memset(pktinfo4, 0, sizeof(struct in_pktinfo));
+		if (inet_pton(domain, SRC4_IP,
+			      (void *)&pktinfo4->ipi_spec_dst) != 1)
+			return -1;
+	} else if (domain == AF_INET6) {
+		cmsg->cmsg_level = SOL_IPV6;
+		cmsg->cmsg_type = IPV6_PKTINFO;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
+		pktinfo6 = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+		memset(pktinfo6, 0, sizeof(struct in6_pktinfo));
+		if (inet_pton(domain, SRC6_IP,
+			      (void *)&pktinfo6->ipi6_addr) != 1)
+			return -1;
+	} else {
+		return -1;
+	}
+
+	return 0;
+}
+
+static int sendmsg_to_server(const struct sockaddr_storage *addr,
+			     socklen_t addr_len, int set_cmsg, int *syscall_err)
+{
+	union {
+		char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
+		struct cmsghdr align;
+	} control6;
+	union {
+		char buf[CMSG_SPACE(sizeof(struct in_pktinfo))];
+		struct cmsghdr align;
+	} control4;
+	struct msghdr hdr;
+	struct iovec iov;
+	char data = 'a';
+	int domain;
+	int fd = -1;
+
+	domain = addr->ss_family;
+
+	if (domain != AF_INET && domain != AF_INET6) {
+		log_err("Unsupported address family");
+		goto err;
+	}
+
+	fd = socket(domain, SOCK_DGRAM, 0);
+	if (fd == -1) {
+		log_err("Failed to create client socket");
+		goto err;
+	}
+
+	memset(&iov, 0, sizeof(iov));
+	iov.iov_base = &data;
+	iov.iov_len = sizeof(data);
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_name = (void *)addr;
+	hdr.msg_namelen = addr_len;
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+
+	if (set_cmsg) {
+		if (domain == AF_INET) {
+			hdr.msg_control = &control4;
+			hdr.msg_controllen = sizeof(control4.buf);
+		} else if (domain == AF_INET6) {
+			hdr.msg_control = &control6;
+			hdr.msg_controllen = sizeof(control6.buf);
+		}
+		if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) {
+			log_err("Fail to init pktinfo");
+			goto err;
+		}
+	}
+
+	if (sendmsg(fd, &hdr, 0) != sizeof(data)) {
+		log_err("Fail to send message to server");
+		*syscall_err = errno;
+		goto err;
+	}
+
+	goto out;
+err:
+	close(fd);
+	fd = -1;
+out:
+	return fd;
+}
+
+static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr)
+{
+	struct timeval tv;
+	struct msghdr hdr;
+	struct iovec iov;
+	char data[64];
+	fd_set rfds;
+
+	FD_ZERO(&rfds);
+	FD_SET(sockfd, &rfds);
+
+	tv.tv_sec = 2;
+	tv.tv_usec = 0;
+
+	if (select(sockfd + 1, &rfds, NULL, NULL, &tv) <= 0 ||
+	    !FD_ISSET(sockfd, &rfds))
+		return -1;
+
+	memset(&iov, 0, sizeof(iov));
+	iov.iov_base = data;
+	iov.iov_len = sizeof(data);
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_name = src_addr;
+	hdr.msg_namelen = sizeof(struct sockaddr_storage);
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+
+	return recvmsg(sockfd, &hdr, 0);
+}
+
 static int init_addrs(const struct sock_addr_test *test,
 		      struct sockaddr_storage *requested_addr,
 		      struct sockaddr_storage *expected_addr,
@@ -753,6 +1194,69 @@ out:
 	return err;
 }
 
+static int run_sendmsg_test_case(const struct sock_addr_test *test)
+{
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage expected_src_addr;
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	struct sockaddr_storage real_src_addr;
+	int clientfd = -1;
+	int servfd = -1;
+	int set_cmsg;
+	int err = 0;
+
+	if (test->type != SOCK_DGRAM)
+		goto err;
+
+	if (init_addrs(test, &requested_addr, &expected_addr,
+		       &expected_src_addr))
+		goto err;
+
+	/* Prepare server to sendmsg to */
+	servfd = start_server(test->type, &expected_addr, addr_len);
+	if (servfd == -1)
+		goto err;
+
+	for (set_cmsg = 0; set_cmsg <= 1; ++set_cmsg) {
+		if (clientfd >= 0)
+			close(clientfd);
+
+		clientfd = sendmsg_to_server(&requested_addr, addr_len,
+					     set_cmsg, &err);
+		if (err)
+			goto out;
+		else if (clientfd == -1)
+			goto err;
+
+		/* Try to receive message on server instead of using
+		 * getpeername(2) on client socket, to check that client's
+		 * destination address was rewritten properly, since
+		 * getpeername(2) doesn't work with unconnected datagram
+		 * sockets.
+		 *
+		 * Get source address from recvmsg(2) as well to make sure
+		 * source was rewritten properly: getsockname(2) can't be used
+		 * since socket is unconnected and source defined for one
+		 * specific packet may differ from the one used by default and
+		 * returned by getsockname(2).
+		 */
+		if (recvmsg_from_client(servfd, &real_src_addr) == -1)
+			goto err;
+
+		if (cmp_addr(&real_src_addr, &expected_src_addr, /*cmp_port*/0))
+			goto err;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(clientfd);
+	close(servfd);
+	return err;
+}
+
 static int run_test_case(int cgfd, const struct sock_addr_test *test)
 {
 	int progfd = -1;
@@ -784,10 +1288,24 @@ static int run_test_case(int cgfd, const struct sock_addr_test *test)
 	case BPF_CGROUP_INET6_CONNECT:
 		err = run_connect_test_case(test);
 		break;
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
+		err = run_sendmsg_test_case(test);
+		break;
 	default:
 		goto err;
 	}
 
+	if (test->expected_result == SYSCALL_EPERM && err == EPERM) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	}
+
+	if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	}
+
 	if (err || test->expected_result != SUCCESS)
 		goto err;
 
-- 
cgit v1.2.3


From 7a279e93330bd4f0074973293f1cd1aacf7c5fa6 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Tue, 29 May 2018 12:27:44 +0100
Subject: bpf: clean up eBPF helpers documentation

These are minor edits for the eBPF helpers documentation in
include/uapi/linux/bpf.h.

The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends
with a non-escaped underscore that gets interpreted by rst2man and
produces the following message in the resulting manual page:

    DOCUTILS SYSTEM MESSAGES
           System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514)
                  Unknown target name: "bpf_fib_lookup".

Other edits consist in:

- Improving formatting for flag values for "bpf_fib_lookup()" helper.
- Emphasising a parameter name in description of the return value for
  "bpf_get_stack()" helper.
- Removing unnecessary blank lines between "Description" and "Return"
  sections for the few helpers that would use it, for consistency.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h       | 21 ++++++++++-----------
 tools/include/uapi/linux/bpf.h | 21 ++++++++++-----------
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cc68787f2d97..3f556b35ac8d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1010,7 +1010,6 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
@@ -1821,10 +1820,9 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
- * 		a non-negative value equal to or less than size on success, or
- * 		a negative error in case of failure.
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
  *
  * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
@@ -1845,7 +1843,6 @@ union bpf_attr {
  * 		in socket filters where *skb*\ **->data** does not always point
  * 		to the start of the mac header and where "direct packet access"
  * 		is not available.
- *
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1861,16 +1858,18 @@ union bpf_attr {
  *		rt_metric is set to metric from route.
  *
  *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
  *
- *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
- *             full lookup using FIB rules
- *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
- *             perspective (default is ingress)
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
- *
  *     Return
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cc68787f2d97..3f556b35ac8d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1010,7 +1010,6 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
@@ -1821,10 +1820,9 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
- * 		a non-negative value equal to or less than size on success, or
- * 		a negative error in case of failure.
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
  *
  * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
@@ -1845,7 +1843,6 @@ union bpf_attr {
  * 		in socket filters where *skb*\ **->data** does not always point
  * 		to the start of the mac header and where "direct packet access"
  * 		is not available.
- *
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1861,16 +1858,18 @@ union bpf_attr {
  *		rt_metric is set to metric from route.
  *
  *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
  *
- *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
- *             full lookup using FIB rules
- *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
- *             perspective (default is ingress)
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
- *
  *     Return
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
-- 
cgit v1.2.3


From 720e7f38bc3c59bf70974af326f7fb871a1ce89c Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 29 May 2018 10:40:18 +0800
Subject: bpf: hide the unused 'off' variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The local variable is only used while CONFIG_IPV6 enabled

net/core/filter.c: In function ‘sk_msg_convert_ctx_access’:
net/core/filter.c:6489:6: warning: unused variable ‘off’ [-Wunused-variable]
  int off;
      ^
This puts it into #ifdef.

Fixes: 303def35f64e ("bpf: allow sk_msg programs to read sock fields")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 24e6ce8be567..0ce93edefb0e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6486,7 +6486,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				     struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
+#if IS_ENABLED(CONFIG_IPV6)
 	int off;
+#endif
 
 	switch (si->off) {
 	case offsetof(struct sk_msg_md, data):
-- 
cgit v1.2.3


From fa898d769b264ede3c10cc30a537316c6a946956 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 29 May 2018 10:58:07 -0700
Subject: bpf: Drop mpls from bpf_fib_lookup

MPLS support will not be submitted this dev cycle, but in working on it
I do see a few changes are needed to the API. For now, drop mpls from the
API. Since the fields in question are unions, the mpls fields can be added
back later without affecting the uapi.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3f556b35ac8d..33f37eb0b6bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1852,10 +1852,10 @@ union bpf_attr {
  *		If lookup is successful and result shows packet is to be
  *		forwarded, the neighbor tables are searched for the nexthop.
  *		If successful (ie., FIB lookup shows forwarding and nexthop
- *		is resolved), the nexthop address is returned in ipv4_dst,
- *		ipv6_dst or mpls_out based on family, smac is set to mac
- *		address of egress device, dmac is set to nexthop mac address,
- *		rt_metric is set to metric from route.
+ *		is resolved), the nexthop address is returned in ipv4_dst
+ *		or ipv6_dst based on family, smac is set to mac address of
+ *		egress device, dmac is set to nexthop mac address, rt_metric
+ *		is set to metric from route (IPv4/IPv6 only).
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -2537,8 +2537,10 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
 struct bpf_fib_lookup {
-	/* input */
-	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+	/* input:  network family for lookup (AF_INET, AF_INET6)
+	 * output: network family of egress nexthop
+	 */
+	__u8	family;
 
 	/* set if lookup is to consider L4 data - e.g., FIB rules */
 	__u8	l4_protocol;
@@ -2554,22 +2556,20 @@ struct bpf_fib_lookup {
 		__u8	tos;		/* AF_INET  */
 		__be32	flowlabel;	/* AF_INET6 */
 
-		/* output: metric of fib result */
-		__u32 rt_metric;
+		/* output: metric of fib result (IPv4/IPv6 only) */
+		__u32	rt_metric;
 	};
 
 	union {
-		__be32		mpls_in;
 		__be32		ipv4_src;
 		__u32		ipv6_src[4];  /* in6_addr; network order */
 	};
 
-	/* input to bpf_fib_lookup, *dst is destination address.
-	 * output: bpf_fib_lookup sets to gateway address
+	/* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+	 * network header. output: bpf_fib_lookup sets to gateway address
+	 * if FIB lookup returns gateway route
 	 */
 	union {
-		/* return for MPLS lookups */
-		__be32		mpls_out[4];  /* support up to 4 labels */
 		__be32		ipv4_dst;
 		__u32		ipv6_dst[4];  /* in6_addr; network order */
 	};
-- 
cgit v1.2.3


From 9ce64f192d161acff17c99ceec7d9ce3db9252fa Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 29 May 2018 11:59:13 -0700
Subject: bpf: Verify flags in bpf_fib_lookup

Verify flags argument contains only known flags. Allows programs to probe
for support as more are added.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 0ce93edefb0e..81bd2e9fe8fc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4270,6 +4270,9 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
+	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+		return -EINVAL;
+
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
@@ -4304,6 +4307,9 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
+	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+		return -EINVAL;
+
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-- 
cgit v1.2.3


From 170a7e3ea0709eae12c8f944b9f33c54fe80c6c1 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Sun, 27 May 2018 12:24:08 +0100
Subject: bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not
 found

This makes is it possible for bpf prog detach to return -ENOENT.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/core.c        | 11 +++++++++--
 kernel/trace/bpf_trace.c |  2 ++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b574dddc05b8..527587de8a67 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1616,6 +1616,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	int new_prog_cnt, carry_prog_cnt = 0;
 	struct bpf_prog **existing_prog;
 	struct bpf_prog_array *array;
+	bool found_exclude = false;
 	int new_prog_idx = 0;
 
 	/* Figure out how many existing progs we need to carry over to
@@ -1624,14 +1625,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	if (old_array) {
 		existing_prog = old_array->progs;
 		for (; *existing_prog; existing_prog++) {
-			if (*existing_prog != exclude_prog &&
-			    *existing_prog != &dummy_bpf_prog.prog)
+			if (*existing_prog == exclude_prog) {
+				found_exclude = true;
+				continue;
+			}
+			if (*existing_prog != &dummy_bpf_prog.prog)
 				carry_prog_cnt++;
 			if (*existing_prog == include_prog)
 				return -EEXIST;
 		}
 	}
 
+	if (exclude_prog && !found_exclude)
+		return -ENOENT;
+
 	/* How many progs (not NULL) will be in the new array? */
 	new_prog_cnt = carry_prog_cnt;
 	if (include_prog)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 81fdf2fc94ac..af1486d9a0ed 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1006,6 +1006,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 
 	old_array = event->tp_event->prog_array;
 	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+	if (ret == -ENOENT)
+		goto unlock;
 	if (ret < 0) {
 		bpf_prog_array_delete_safe(old_array, event->prog);
 	} else {
-- 
cgit v1.2.3


From f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Sun, 27 May 2018 12:24:09 +0100
Subject: media: rc: introduce BPF_PROG_LIRC_MODE2

Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call
rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
that the last key should be repeated.

The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
the target_fd must be the /dev/lircN device.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/media/rc/Kconfig        |  13 ++
 drivers/media/rc/Makefile       |   1 +
 drivers/media/rc/bpf-lirc.c     | 313 ++++++++++++++++++++++++++++++++++++++++
 drivers/media/rc/lirc_dev.c     |  30 ++++
 drivers/media/rc/rc-core-priv.h |  21 +++
 drivers/media/rc/rc-ir-raw.c    |  12 +-
 include/linux/bpf_lirc.h        |  29 ++++
 include/linux/bpf_types.h       |   3 +
 include/uapi/linux/bpf.h        |  53 ++++++-
 kernel/bpf/syscall.c            |   7 +
 10 files changed, 479 insertions(+), 3 deletions(-)
 create mode 100644 drivers/media/rc/bpf-lirc.c
 create mode 100644 include/linux/bpf_lirc.h

diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
index eb2c3b6eca7f..d5b35a6ba899 100644
--- a/drivers/media/rc/Kconfig
+++ b/drivers/media/rc/Kconfig
@@ -25,6 +25,19 @@ config LIRC
 	   passes raw IR to and from userspace, which is needed for
 	   IR transmitting (aka "blasting") and for the lirc daemon.
 
+config BPF_LIRC_MODE2
+	bool "Support for eBPF programs attached to lirc devices"
+	depends on BPF_SYSCALL
+	depends on RC_CORE=y
+	depends on LIRC
+	help
+	   Allow attaching eBPF programs to a lirc device using the bpf(2)
+	   syscall command BPF_PROG_ATTACH. This is supported for raw IR
+	   receivers.
+
+	   These eBPF programs can be used to decode IR into scancodes, for
+	   IR protocols not supported by the kernel decoders.
+
 menuconfig RC_DECODERS
 	bool "Remote controller decoders"
 	depends on RC_CORE
diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
index 2e1c87066f6c..e0340d043fe8 100644
--- a/drivers/media/rc/Makefile
+++ b/drivers/media/rc/Makefile
@@ -5,6 +5,7 @@ obj-y += keymaps/
 obj-$(CONFIG_RC_CORE) += rc-core.o
 rc-core-y := rc-main.o rc-ir-raw.o
 rc-core-$(CONFIG_LIRC) += lirc_dev.o
+rc-core-$(CONFIG_BPF_LIRC_MODE2) += bpf-lirc.o
 obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
 obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
 obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
new file mode 100644
index 000000000000..40826bba06b6
--- /dev/null
+++ b/drivers/media/rc/bpf-lirc.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+// bpf-lirc.c - handles bpf
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_lirc.h>
+#include "rc-core-priv.h"
+
+/*
+ * BPF interface for raw IR
+ */
+const struct bpf_prog_ops lirc_mode2_prog_ops = {
+};
+
+BPF_CALL_1(bpf_rc_repeat, u32*, sample)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
+
+	rc_repeat(ctrl->dev);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_repeat_proto = {
+	.func	   = bpf_rc_repeat,
+	.gpl_only  = true, /* rc_repeat is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+};
+
+/*
+ * Currently rc-core does not support 64-bit scancodes, but there are many
+ * known protocols with more than 32 bits. So, define the interface as u64
+ * as a future-proof.
+ */
+BPF_CALL_4(bpf_rc_keydown, u32*, sample, u32, protocol, u64, scancode,
+	   u32, toggle)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
+
+	rc_keydown(ctrl->dev, protocol, scancode, toggle != 0);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_keydown_proto = {
+	.func	   = bpf_rc_keydown,
+	.gpl_only  = true, /* rc_keydown is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_ANYTHING,
+	.arg3_type = ARG_ANYTHING,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_rc_repeat:
+		return &rc_repeat_proto;
+	case BPF_FUNC_rc_keydown:
+		return &rc_keydown_proto;
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_ktime_get_ns:
+		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
+	case BPF_FUNC_get_prandom_u32:
+		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+		/* fall through */
+	default:
+		return NULL;
+	}
+}
+
+static bool lirc_mode2_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	/* We have one field of u32 */
+	return type == BPF_READ && off == 0 && size == sizeof(u32);
+}
+
+const struct bpf_verifier_ops lirc_mode2_verifier_ops = {
+	.get_func_proto  = lirc_mode2_func_proto,
+	.is_valid_access = lirc_mode2_is_valid_access
+};
+
+#define BPF_MAX_PROGS 64
+
+static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	struct ir_raw_event_ctrl *raw;
+	int ret;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	if (raw->progs && bpf_prog_array_length(raw->progs) >= BPF_MAX_PROGS) {
+		ret = -E2BIG;
+		goto unlock;
+	}
+
+	old_array = raw->progs;
+	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	if (ret < 0)
+		goto unlock;
+
+	rcu_assign_pointer(raw->progs, new_array);
+	bpf_prog_array_free(old_array);
+
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	struct ir_raw_event_ctrl *raw;
+	int ret;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	old_array = raw->progs;
+	ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array);
+	/*
+	 * Do not use bpf_prog_array_delete_safe() as we would end up
+	 * with a dummy entry in the array, and the we would free the
+	 * dummy in lirc_bpf_free()
+	 */
+	if (ret)
+		goto unlock;
+
+	rcu_assign_pointer(raw->progs, new_array);
+	bpf_prog_array_free(old_array);
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
+{
+	struct ir_raw_event_ctrl *raw = rcdev->raw;
+
+	raw->bpf_sample = sample;
+
+	if (raw->progs)
+		BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN);
+}
+
+/*
+ * This should be called once the rc thread has been stopped, so there can be
+ * no concurrent bpf execution.
+ */
+void lirc_bpf_free(struct rc_dev *rcdev)
+{
+	struct bpf_prog **progs;
+
+	if (!rcdev->raw->progs)
+		return;
+
+	progs = rcu_dereference(rcdev->raw->progs)->progs;
+	while (*progs)
+		bpf_prog_put(*progs++);
+
+	bpf_prog_array_free(rcdev->raw->progs);
+}
+
+int lirc_prog_attach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_LIRC_MODE2);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = lirc_bpf_attach(rcdev, prog);
+	if (ret)
+		bpf_prog_put(prog);
+
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int lirc_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_LIRC_MODE2);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = lirc_bpf_detach(rcdev, prog);
+
+	bpf_prog_put(prog);
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	struct bpf_prog_array __rcu *progs;
+	struct rc_dev *rcdev;
+	u32 cnt, flags = 0;
+	int ret;
+
+	if (attr->query.query_flags)
+		return -EINVAL;
+
+	rcdev = rc_dev_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(rcdev))
+		return PTR_ERR(rcdev);
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW) {
+		ret = -EINVAL;
+		goto put;
+	}
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		goto put;
+
+	progs = rcdev->raw->progs;
+	cnt = progs ? bpf_prog_array_length(progs) : 0;
+
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+
+	if (attr->query.prog_cnt != 0 && prog_ids && cnt)
+		ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt);
+
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+put:
+	put_device(&rcdev->dev);
+
+	return ret;
+}
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index 24e9fbb80e81..da7013a12a58 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/device.h>
+#include <linux/file.h>
 #include <linux/idr.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
@@ -104,6 +105,12 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev)
 			TO_US(ev.duration), TO_STR(ev.pulse));
 	}
 
+	/*
+	 * bpf does not care about the gap generated above; that exists
+	 * for backwards compatibility
+	 */
+	lirc_bpf_run(dev, sample);
+
 	spin_lock_irqsave(&dev->lirc_fh_lock, flags);
 	list_for_each_entry(fh, &dev->lirc_fh, list) {
 		if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports)
@@ -816,4 +823,27 @@ void __exit lirc_dev_exit(void)
 	unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX);
 }
 
+struct rc_dev *rc_dev_get_from_fd(int fd)
+{
+	struct fd f = fdget(fd);
+	struct lirc_fh *fh;
+	struct rc_dev *dev;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &lirc_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fh = f.file->private_data;
+	dev = fh->rc;
+
+	get_device(&dev->dev);
+	fdput(f);
+
+	return dev;
+}
+
 MODULE_ALIAS("lirc_dev");
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index e0e6a17460f6..eb004757038b 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -13,6 +13,7 @@
 #define	MAX_IR_EVENT_SIZE	512
 
 #include <linux/slab.h>
+#include <uapi/linux/bpf.h>
 #include <media/rc-core.h>
 
 /**
@@ -57,6 +58,11 @@ struct ir_raw_event_ctrl {
 	/* raw decoder state follows */
 	struct ir_raw_event prev_ev;
 	struct ir_raw_event this_ev;
+
+#ifdef CONFIG_BPF_LIRC_MODE2
+	u32				bpf_sample;
+	struct bpf_prog_array __rcu	*progs;
+#endif
 	struct nec_dec {
 		int state;
 		unsigned count;
@@ -126,6 +132,9 @@ struct ir_raw_event_ctrl {
 	} imon;
 };
 
+/* Mutex for locking raw IR processing and handler change */
+extern struct mutex ir_raw_handler_lock;
+
 /* macros for IR decoders */
 static inline bool geq_margin(unsigned d1, unsigned d2, unsigned margin)
 {
@@ -288,6 +297,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev);
 void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc);
 int ir_lirc_register(struct rc_dev *dev);
 void ir_lirc_unregister(struct rc_dev *dev);
+struct rc_dev *rc_dev_get_from_fd(int fd);
 #else
 static inline int lirc_dev_init(void) { return 0; }
 static inline void lirc_dev_exit(void) {}
@@ -299,4 +309,15 @@ static inline int ir_lirc_register(struct rc_dev *dev) { return 0; }
 static inline void ir_lirc_unregister(struct rc_dev *dev) { }
 #endif
 
+/*
+ * bpf interface
+ */
+#ifdef CONFIG_BPF_LIRC_MODE2
+void lirc_bpf_free(struct rc_dev *dev);
+void lirc_bpf_run(struct rc_dev *dev, u32 sample);
+#else
+static inline void lirc_bpf_free(struct rc_dev *dev) { }
+static inline void lirc_bpf_run(struct rc_dev *dev, u32 sample) { }
+#endif
+
 #endif /* _RC_CORE_PRIV */
diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c
index 374f83105a23..7675b7ee5bc7 100644
--- a/drivers/media/rc/rc-ir-raw.c
+++ b/drivers/media/rc/rc-ir-raw.c
@@ -14,7 +14,7 @@
 static LIST_HEAD(ir_raw_client_list);
 
 /* Used to handle IR raw handler extensions */
-static DEFINE_MUTEX(ir_raw_handler_lock);
+DEFINE_MUTEX(ir_raw_handler_lock);
 static LIST_HEAD(ir_raw_handler_list);
 static atomic64_t available_protocols = ATOMIC64_INIT(0);
 
@@ -621,9 +621,17 @@ void ir_raw_event_unregister(struct rc_dev *dev)
 	list_for_each_entry(handler, &ir_raw_handler_list, list)
 		if (handler->raw_unregister)
 			handler->raw_unregister(dev);
-	mutex_unlock(&ir_raw_handler_lock);
+
+	lirc_bpf_free(dev);
 
 	ir_raw_event_free(dev);
+
+	/*
+	 * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so
+	 * ensure that the raw member is null on unlock; this is how
+	 * "device gone" is checked.
+	 */
+	mutex_unlock(&ir_raw_handler_lock);
 }
 
 /*
diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h
new file mode 100644
index 000000000000..5f8a4283092d
--- /dev/null
+++ b/include/linux/bpf_lirc.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_LIRC_H
+#define _BPF_LIRC_H
+
+#include <uapi/linux/bpf.h>
+
+#ifdef CONFIG_BPF_LIRC_MODE2
+int lirc_prog_attach(const union bpf_attr *attr);
+int lirc_prog_detach(const union bpf_attr *attr);
+int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+#else
+static inline int lirc_prog_attach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int lirc_prog_detach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int lirc_prog_query(const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	return -EINVAL;
+}
+#endif
+
+#endif /* _BPF_LIRC_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b161e506dcfc..c5700c2d5549 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -26,6 +26,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_BPF_LIRC_MODE2
+BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 33f37eb0b6bf..64ac0f7a689e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,6 +143,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
+	BPF_PROG_TYPE_LIRC_MODE2,
 };
 
 enum bpf_attach_type {
@@ -162,6 +163,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_POST_BIND,
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
+	BPF_LIRC_MODE2,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2005,6 +2007,53 @@ union bpf_attr {
  * 		direct packet access.
  *	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown** () again with the same values, or calling
+ *		**bpf_rc_repeat** ().
+ *
+ *		Some protocols include a toggle bit, in case the button	was
+ *		released and pressed again between consecutive scancodes.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
+ *
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2083,7 +2132,9 @@ union bpf_attr {
 	FN(lwt_push_encap),		\
 	FN(lwt_seg6_store_bytes),	\
 	FN(lwt_seg6_adjust_srh),	\
-	FN(lwt_seg6_action),
+	FN(lwt_seg6_action),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e254526d6744..7365d79ae00d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/bpf_lirc.h>
 #include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
@@ -1582,6 +1583,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_attach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1654,6 +1657,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1703,6 +1708,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
+	case BPF_LIRC_MODE2:
+		return lirc_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 6bdd533cee9aadbcd476af30bfff079abe68fcdb Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Sun, 27 May 2018 12:24:10 +0100
Subject: bpf: add selftest for lirc_mode2 type program

This is simple test over rc-loopback.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/prog.c                           |   1 +
 tools/include/uapi/linux/bpf.h                     |  79 ++++++--
 tools/include/uapi/linux/lirc.h                    | 217 +++++++++++++++++++++
 tools/lib/bpf/libbpf.c                             |   1 +
 tools/testing/selftests/bpf/.gitignore             |   1 +
 tools/testing/selftests/bpf/Makefile               |   7 +-
 tools/testing/selftests/bpf/bpf_helpers.h          |   5 +
 tools/testing/selftests/bpf/test_lirc_mode2.sh     |  28 +++
 tools/testing/selftests/bpf/test_lirc_mode2_kern.c |  23 +++
 tools/testing/selftests/bpf/test_lirc_mode2_user.c | 149 ++++++++++++++
 10 files changed, 494 insertions(+), 17 deletions(-)
 create mode 100644 tools/include/uapi/linux/lirc.h
 create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
 create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 39b88e760367..a4f435203fef 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_MSG]		= "sk_msg",
 	[BPF_PROG_TYPE_RAW_TRACEPOINT]	= "raw_tracepoint",
 	[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
+	[BPF_PROG_TYPE_LIRC_MODE2]	= "lirc_mode2",
 };
 
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3f556b35ac8d..64ac0f7a689e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -143,6 +143,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
+	BPF_PROG_TYPE_LIRC_MODE2,
 };
 
 enum bpf_attach_type {
@@ -162,6 +163,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_POST_BIND,
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
+	BPF_LIRC_MODE2,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1852,10 +1854,10 @@ union bpf_attr {
  *		If lookup is successful and result shows packet is to be
  *		forwarded, the neighbor tables are searched for the nexthop.
  *		If successful (ie., FIB lookup shows forwarding and nexthop
- *		is resolved), the nexthop address is returned in ipv4_dst,
- *		ipv6_dst or mpls_out based on family, smac is set to mac
- *		address of egress device, dmac is set to nexthop mac address,
- *		rt_metric is set to metric from route.
+ *		is resolved), the nexthop address is returned in ipv4_dst
+ *		or ipv6_dst based on family, smac is set to mac address of
+ *		egress device, dmac is set to nexthop mac address, rt_metric
+ *		is set to metric from route (IPv4/IPv6 only).
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -2005,6 +2007,53 @@ union bpf_attr {
  * 		direct packet access.
  *	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown** () again with the same values, or calling
+ *		**bpf_rc_repeat** ().
+ *
+ *		Some protocols include a toggle bit, in case the button	was
+ *		released and pressed again between consecutive scancodes.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
+ *
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2083,7 +2132,9 @@ union bpf_attr {
 	FN(lwt_push_encap),		\
 	FN(lwt_seg6_store_bytes),	\
 	FN(lwt_seg6_adjust_srh),	\
-	FN(lwt_seg6_action),
+	FN(lwt_seg6_action),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2537,8 +2588,10 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
 struct bpf_fib_lookup {
-	/* input */
-	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+	/* input:  network family for lookup (AF_INET, AF_INET6)
+	 * output: network family of egress nexthop
+	 */
+	__u8	family;
 
 	/* set if lookup is to consider L4 data - e.g., FIB rules */
 	__u8	l4_protocol;
@@ -2554,22 +2607,20 @@ struct bpf_fib_lookup {
 		__u8	tos;		/* AF_INET  */
 		__be32	flowlabel;	/* AF_INET6 */
 
-		/* output: metric of fib result */
-		__u32 rt_metric;
+		/* output: metric of fib result (IPv4/IPv6 only) */
+		__u32	rt_metric;
 	};
 
 	union {
-		__be32		mpls_in;
 		__be32		ipv4_src;
 		__u32		ipv6_src[4];  /* in6_addr; network order */
 	};
 
-	/* input to bpf_fib_lookup, *dst is destination address.
-	 * output: bpf_fib_lookup sets to gateway address
+	/* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+	 * network header. output: bpf_fib_lookup sets to gateway address
+	 * if FIB lookup returns gateway route
 	 */
 	union {
-		/* return for MPLS lookups */
-		__be32		mpls_out[4];  /* support up to 4 labels */
 		__be32		ipv4_dst;
 		__u32		ipv6_dst[4];  /* in6_addr; network order */
 	};
diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h
new file mode 100644
index 000000000000..f189931042a7
--- /dev/null
+++ b/tools/include/uapi/linux/lirc.h
@@ -0,0 +1,217 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * lirc.h - linux infrared remote control header file
+ * last modified 2010/07/13 by Jarod Wilson
+ */
+
+#ifndef _LINUX_LIRC_H
+#define _LINUX_LIRC_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define PULSE_BIT       0x01000000
+#define PULSE_MASK      0x00FFFFFF
+
+#define LIRC_MODE2_SPACE     0x00000000
+#define LIRC_MODE2_PULSE     0x01000000
+#define LIRC_MODE2_FREQUENCY 0x02000000
+#define LIRC_MODE2_TIMEOUT   0x03000000
+
+#define LIRC_VALUE_MASK      0x00FFFFFF
+#define LIRC_MODE2_MASK      0xFF000000
+
+#define LIRC_SPACE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_SPACE)
+#define LIRC_PULSE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_PULSE)
+#define LIRC_FREQUENCY(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY)
+#define LIRC_TIMEOUT(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT)
+
+#define LIRC_VALUE(val) ((val)&LIRC_VALUE_MASK)
+#define LIRC_MODE2(val) ((val)&LIRC_MODE2_MASK)
+
+#define LIRC_IS_SPACE(val) (LIRC_MODE2(val) == LIRC_MODE2_SPACE)
+#define LIRC_IS_PULSE(val) (LIRC_MODE2(val) == LIRC_MODE2_PULSE)
+#define LIRC_IS_FREQUENCY(val) (LIRC_MODE2(val) == LIRC_MODE2_FREQUENCY)
+#define LIRC_IS_TIMEOUT(val) (LIRC_MODE2(val) == LIRC_MODE2_TIMEOUT)
+
+/* used heavily by lirc userspace */
+#define lirc_t int
+
+/*** lirc compatible hardware features ***/
+
+#define LIRC_MODE2SEND(x) (x)
+#define LIRC_SEND2MODE(x) (x)
+#define LIRC_MODE2REC(x) ((x) << 16)
+#define LIRC_REC2MODE(x) ((x) >> 16)
+
+#define LIRC_MODE_RAW                  0x00000001
+#define LIRC_MODE_PULSE                0x00000002
+#define LIRC_MODE_MODE2                0x00000004
+#define LIRC_MODE_SCANCODE             0x00000008
+#define LIRC_MODE_LIRCCODE             0x00000010
+
+
+#define LIRC_CAN_SEND_RAW              LIRC_MODE2SEND(LIRC_MODE_RAW)
+#define LIRC_CAN_SEND_PULSE            LIRC_MODE2SEND(LIRC_MODE_PULSE)
+#define LIRC_CAN_SEND_MODE2            LIRC_MODE2SEND(LIRC_MODE_MODE2)
+#define LIRC_CAN_SEND_LIRCCODE         LIRC_MODE2SEND(LIRC_MODE_LIRCCODE)
+
+#define LIRC_CAN_SEND_MASK             0x0000003f
+
+#define LIRC_CAN_SET_SEND_CARRIER      0x00000100
+#define LIRC_CAN_SET_SEND_DUTY_CYCLE   0x00000200
+#define LIRC_CAN_SET_TRANSMITTER_MASK  0x00000400
+
+#define LIRC_CAN_REC_RAW               LIRC_MODE2REC(LIRC_MODE_RAW)
+#define LIRC_CAN_REC_PULSE             LIRC_MODE2REC(LIRC_MODE_PULSE)
+#define LIRC_CAN_REC_MODE2             LIRC_MODE2REC(LIRC_MODE_MODE2)
+#define LIRC_CAN_REC_SCANCODE          LIRC_MODE2REC(LIRC_MODE_SCANCODE)
+#define LIRC_CAN_REC_LIRCCODE          LIRC_MODE2REC(LIRC_MODE_LIRCCODE)
+
+#define LIRC_CAN_REC_MASK              LIRC_MODE2REC(LIRC_CAN_SEND_MASK)
+
+#define LIRC_CAN_SET_REC_CARRIER       (LIRC_CAN_SET_SEND_CARRIER << 16)
+#define LIRC_CAN_SET_REC_DUTY_CYCLE    (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16)
+
+#define LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE 0x40000000
+#define LIRC_CAN_SET_REC_CARRIER_RANGE    0x80000000
+#define LIRC_CAN_GET_REC_RESOLUTION       0x20000000
+#define LIRC_CAN_SET_REC_TIMEOUT          0x10000000
+#define LIRC_CAN_SET_REC_FILTER           0x08000000
+
+#define LIRC_CAN_MEASURE_CARRIER          0x02000000
+#define LIRC_CAN_USE_WIDEBAND_RECEIVER    0x04000000
+
+#define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK)
+#define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK)
+
+#define LIRC_CAN_NOTIFY_DECODE            0x01000000
+
+/*** IOCTL commands for lirc driver ***/
+
+#define LIRC_GET_FEATURES              _IOR('i', 0x00000000, __u32)
+
+#define LIRC_GET_SEND_MODE             _IOR('i', 0x00000001, __u32)
+#define LIRC_GET_REC_MODE              _IOR('i', 0x00000002, __u32)
+#define LIRC_GET_REC_RESOLUTION        _IOR('i', 0x00000007, __u32)
+
+#define LIRC_GET_MIN_TIMEOUT           _IOR('i', 0x00000008, __u32)
+#define LIRC_GET_MAX_TIMEOUT           _IOR('i', 0x00000009, __u32)
+
+/* code length in bits, currently only for LIRC_MODE_LIRCCODE */
+#define LIRC_GET_LENGTH                _IOR('i', 0x0000000f, __u32)
+
+#define LIRC_SET_SEND_MODE             _IOW('i', 0x00000011, __u32)
+#define LIRC_SET_REC_MODE              _IOW('i', 0x00000012, __u32)
+/* Note: these can reset the according pulse_width */
+#define LIRC_SET_SEND_CARRIER          _IOW('i', 0x00000013, __u32)
+#define LIRC_SET_REC_CARRIER           _IOW('i', 0x00000014, __u32)
+#define LIRC_SET_SEND_DUTY_CYCLE       _IOW('i', 0x00000015, __u32)
+#define LIRC_SET_TRANSMITTER_MASK      _IOW('i', 0x00000017, __u32)
+
+/*
+ * when a timeout != 0 is set the driver will send a
+ * LIRC_MODE2_TIMEOUT data packet, otherwise LIRC_MODE2_TIMEOUT is
+ * never sent, timeout is disabled by default
+ */
+#define LIRC_SET_REC_TIMEOUT           _IOW('i', 0x00000018, __u32)
+
+/* 1 enables, 0 disables timeout reports in MODE2 */
+#define LIRC_SET_REC_TIMEOUT_REPORTS   _IOW('i', 0x00000019, __u32)
+
+/*
+ * if enabled from the next key press on the driver will send
+ * LIRC_MODE2_FREQUENCY packets
+ */
+#define LIRC_SET_MEASURE_CARRIER_MODE	_IOW('i', 0x0000001d, __u32)
+
+/*
+ * to set a range use LIRC_SET_REC_CARRIER_RANGE with the
+ * lower bound first and later LIRC_SET_REC_CARRIER with the upper bound
+ */
+#define LIRC_SET_REC_CARRIER_RANGE     _IOW('i', 0x0000001f, __u32)
+
+#define LIRC_SET_WIDEBAND_RECEIVER     _IOW('i', 0x00000023, __u32)
+
+/*
+ * struct lirc_scancode - decoded scancode with protocol for use with
+ *	LIRC_MODE_SCANCODE
+ *
+ * @timestamp: Timestamp in nanoseconds using CLOCK_MONOTONIC when IR
+ *	was decoded.
+ * @flags: should be 0 for transmit. When receiving scancodes,
+ *	LIRC_SCANCODE_FLAG_TOGGLE or LIRC_SCANCODE_FLAG_REPEAT can be set
+ *	depending on the protocol
+ * @rc_proto: see enum rc_proto
+ * @keycode: the translated keycode. Set to 0 for transmit.
+ * @scancode: the scancode received or to be sent
+ */
+struct lirc_scancode {
+	__u64	timestamp;
+	__u16	flags;
+	__u16	rc_proto;
+	__u32	keycode;
+	__u64	scancode;
+};
+
+/* Set if the toggle bit of rc-5 or rc-6 is enabled */
+#define LIRC_SCANCODE_FLAG_TOGGLE	1
+/* Set if this is a nec or sanyo repeat */
+#define LIRC_SCANCODE_FLAG_REPEAT	2
+
+/**
+ * enum rc_proto - the Remote Controller protocol
+ *
+ * @RC_PROTO_UNKNOWN: Protocol not known
+ * @RC_PROTO_OTHER: Protocol known but proprietary
+ * @RC_PROTO_RC5: Philips RC5 protocol
+ * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol
+ * @RC_PROTO_RC5_SZ: StreamZap variant of RC5
+ * @RC_PROTO_JVC: JVC protocol
+ * @RC_PROTO_SONY12: Sony 12 bit protocol
+ * @RC_PROTO_SONY15: Sony 15 bit protocol
+ * @RC_PROTO_SONY20: Sony 20 bit protocol
+ * @RC_PROTO_NEC: NEC protocol
+ * @RC_PROTO_NECX: Extended NEC protocol
+ * @RC_PROTO_NEC32: NEC 32 bit protocol
+ * @RC_PROTO_SANYO: Sanyo protocol
+ * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard
+ * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse
+ * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol
+ * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol
+ * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol
+ * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol
+ * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol
+ * @RC_PROTO_SHARP: Sharp protocol
+ * @RC_PROTO_XMP: XMP protocol
+ * @RC_PROTO_CEC: CEC protocol
+ * @RC_PROTO_IMON: iMon Pad protocol
+ */
+enum rc_proto {
+	RC_PROTO_UNKNOWN	= 0,
+	RC_PROTO_OTHER		= 1,
+	RC_PROTO_RC5		= 2,
+	RC_PROTO_RC5X_20	= 3,
+	RC_PROTO_RC5_SZ		= 4,
+	RC_PROTO_JVC		= 5,
+	RC_PROTO_SONY12		= 6,
+	RC_PROTO_SONY15		= 7,
+	RC_PROTO_SONY20		= 8,
+	RC_PROTO_NEC		= 9,
+	RC_PROTO_NECX		= 10,
+	RC_PROTO_NEC32		= 11,
+	RC_PROTO_SANYO		= 12,
+	RC_PROTO_MCIR2_KBD	= 13,
+	RC_PROTO_MCIR2_MSE	= 14,
+	RC_PROTO_RC6_0		= 15,
+	RC_PROTO_RC6_6A_20	= 16,
+	RC_PROTO_RC6_6A_24	= 17,
+	RC_PROTO_RC6_6A_32	= 18,
+	RC_PROTO_RC6_MCE	= 19,
+	RC_PROTO_SHARP		= 20,
+	RC_PROTO_XMP		= 21,
+	RC_PROTO_CEC		= 22,
+	RC_PROTO_IMON		= 23,
+};
+
+#endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b1a60ac8424e..a1e96b5de5ff 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1462,6 +1462,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_SK_MSG:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_LIRC_MODE2:
 		return false;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_KPROBE:
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index adc8e5474b66..6ea835982464 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -17,3 +17,4 @@ test_sock_addr
 urandom_read
 test_btf
 test_sockmap
+test_lirc_mode2_user
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a1b66da965d0..553d1816b77a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_btf test_sockmap
+	test_sock test_btf test_sockmap test_lirc_mode2_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -34,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
-	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o
+	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -44,7 +44,8 @@ TEST_PROGS := test_kmod.sh \
 	test_offload.py \
 	test_sock_addr.sh \
 	test_tunnel.sh \
-	test_lwt_seg6local.sh
+	test_lwt_seg6local.sh \
+	test_lirc_mode2.sh
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 334d3e8c5e89..a66a9d91acf4 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -126,6 +126,11 @@ static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param,
 static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset,
 				      unsigned int len) =
 	(void *) BPF_FUNC_lwt_seg6_adjust_srh;
+static int (*bpf_rc_repeat)(void *ctx) =
+	(void *) BPF_FUNC_rc_repeat;
+static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol,
+			     unsigned long long scancode, unsigned int toggle) =
+	(void *) BPF_FUNC_rc_keydown;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2.sh b/tools/testing/selftests/bpf/test_lirc_mode2.sh
new file mode 100755
index 000000000000..ce2e15e4f976
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+modprobe rc-loopback
+
+for i in /sys/class/rc/rc*
+do
+	if grep -q DRV_NAME=rc-loopback $i/uevent
+	then
+		LIRCDEV=$(grep DEVNAME= $i/lirc*/uevent | sed sQDEVNAME=Q/dev/Q)
+	fi
+done
+
+if [ -n $LIRCDEV ];
+then
+	TYPE=lirc_mode2
+	./test_lirc_mode2_user $LIRCDEV
+	ret=$?
+	if [ $ret -ne 0 ]; then
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+	else
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	fi
+fi
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/test_lirc_mode2_kern.c
new file mode 100644
index 000000000000..ba26855563a5
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_kern.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/lirc.h>
+#include "bpf_helpers.h"
+
+SEC("lirc_mode2")
+int bpf_decoder(unsigned int *sample)
+{
+	if (LIRC_IS_PULSE(*sample)) {
+		unsigned int duration = LIRC_VALUE(*sample);
+
+		if (duration & 0x10000)
+			bpf_rc_keydown(sample, 0x40, duration & 0xffff, 0);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
new file mode 100644
index 000000000000..d470d63c33db
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+// A lirc chardev is a device representing a consumer IR (cir) device which
+// can receive infrared signals from remote control and/or transmit IR.
+//
+// IR is sent as a series of pulses and space somewhat like morse code. The
+// BPF program can decode this into scancodes so that rc-core can translate
+// this into input key codes using the rc keymap.
+//
+// This test works by sending IR over rc-loopback, so the IR is processed by
+// BPF and then decoded into scancodes. The lirc chardev must be the one
+// associated with rc-loopback, see the output of ir-keytable(1).
+//
+// The following CONFIG options must be enabled for the test to succeed:
+// CONFIG_RC_CORE=y
+// CONFIG_BPF_RAWIR_EVENT=y
+// CONFIG_RC_LOOPBACK=y
+
+// Steps:
+// 1. Open the /dev/lircN device for rc-loopback (given on command line)
+// 2. Attach bpf_lirc_mode2 program which decodes some IR.
+// 3. Send some IR to the same IR device; since it is loopback, this will
+//    end up in the bpf program
+// 4. bpf program should decode IR and report keycode
+// 5. We can read keycode from same /dev/lirc device
+
+#include <linux/bpf.h>
+#include <linux/lirc.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+int main(int argc, char **argv)
+{
+	struct bpf_object *obj;
+	int ret, lircfd, progfd, mode;
+	int testir = 0x1dead;
+	u32 prog_ids[10], prog_flags[10], prog_cnt;
+
+	if (argc != 2) {
+		printf("Usage: %s /dev/lircN\n", argv[0]);
+		return 2;
+	}
+
+	ret = bpf_prog_load("test_lirc_mode2_kern.o",
+			    BPF_PROG_TYPE_LIRC_MODE2, &obj, &progfd);
+	if (ret) {
+		printf("Failed to load bpf program\n");
+		return 1;
+	}
+
+	lircfd = open(argv[1], O_RDWR | O_NONBLOCK);
+	if (lircfd == -1) {
+		printf("failed to open lirc device %s: %m\n", argv[1]);
+		return 1;
+	}
+
+	/* Let's try detach it before it was ever attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2);
+	if (ret != -1 || errno != ENOENT) {
+		printf("bpf_prog_detach2 not attached should fail: %m\n");
+		return 1;
+	}
+
+	mode = LIRC_MODE_SCANCODE;
+	if (ioctl(lircfd, LIRC_SET_REC_MODE, &mode)) {
+		printf("failed to set rec mode: %m\n");
+		return 1;
+	}
+
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_LIRC_MODE2, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 0) {
+		printf("Expected nothing to be attached\n");
+		return 1;
+	}
+
+	ret = bpf_prog_attach(progfd, lircfd, BPF_LIRC_MODE2, 0);
+	if (ret) {
+		printf("Failed to attach bpf to lirc device: %m\n");
+		return 1;
+	}
+
+	/* Write raw IR */
+	ret = write(lircfd, &testir, sizeof(testir));
+	if (ret != sizeof(testir)) {
+		printf("Failed to send test IR message: %m\n");
+		return 1;
+	}
+
+	struct pollfd pfd = { .fd = lircfd, .events = POLLIN };
+	struct lirc_scancode lsc;
+
+	poll(&pfd, 1, 100);
+
+	/* Read decoded IR */
+	ret = read(lircfd, &lsc, sizeof(lsc));
+	if (ret != sizeof(lsc)) {
+		printf("Failed to read decoded IR: %m\n");
+		return 1;
+	}
+
+	if (lsc.scancode != 0xdead || lsc.rc_proto != 64) {
+		printf("Incorrect scancode decoded\n");
+		return 1;
+	}
+
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_LIRC_MODE2, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 1) {
+		printf("Expected one program to be attached\n");
+		return 1;
+	}
+
+	/* Let's try detaching it now it is actually attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2);
+	if (ret) {
+		printf("bpf_prog_detach2: returned %m\n");
+		return 1;
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From 13a370b9d275959ac75e92dc14e43eeae75804f8 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Tue, 29 May 2018 13:29:31 -0700
Subject: bpftool: Support sendmsg{4,6} attach types

Add support for recently added BPF_CGROUP_UDP4_SENDMSG and
BPF_CGROUP_UDP6_SENDMSG attach types to bpftool, update documentation
and bash completion.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 9 +++++++--
 tools/bpf/bpftool/bash-completion/bpftool          | 5 +++--
 tools/bpf/bpftool/cgroup.c                         | 4 +++-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index d004f6382dab..7b0e6d453e92 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -27,7 +27,8 @@ MAP COMMANDS
 |
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
 |	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
-|		**bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** }
+|		**bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
+|               **sendmsg4** | **sendmsg6** }
 |	*ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
@@ -70,7 +71,11 @@ DESCRIPTION
 		  **post_bind4** return from bind(2) for an inet4 socket (since 4.17);
 		  **post_bind6** return from bind(2) for an inet6 socket (since 4.17);
 		  **connect4** call to connect(2) for an inet4 socket (since 4.17);
-		  **connect6** call to connect(2) for an inet6 socket (since 4.17).
+		  **connect6** call to connect(2) for an inet6 socket (since 4.17);
+		  **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an
+		  unconnected udp4 socket (since 4.18);
+		  **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an
+		  unconnected udp6 socket (since 4.18).
 
 	**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
 		  Detach *PROG* from the cgroup *CGROUP* and attach type
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 7bc198d60de2..1e1083321643 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -407,7 +407,7 @@ _bpftool()
                 attach|detach)
                     local ATTACH_TYPES='ingress egress sock_create sock_ops \
                         device bind4 bind6 post_bind4 post_bind6 connect4 \
-                        connect6'
+                        connect6 sendmsg4 sendmsg6'
                     local ATTACH_FLAGS='multi override'
                     local PROG_TYPE='id pinned tag'
                     case $prev in
@@ -416,7 +416,8 @@ _bpftool()
                             return 0
                             ;;
                         ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
-                        post_bind4|post_bind6|connect4|connect6)
+                        post_bind4|post_bind6|connect4|connect6|sendmsg4|\
+                        sendmsg6)
                             COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
                                 "$cur" ) )
                             return 0
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 1351bd6b5aa7..16bee011e16c 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -20,7 +20,7 @@
 	"       ATTACH_TYPE := { ingress | egress | sock_create |\n"	       \
 	"                        sock_ops | device | bind4 | bind6 |\n"	       \
 	"                        post_bind4 | post_bind6 | connect4 |\n"       \
-	"                        connect6 }"
+	"                        connect6 | sendmsg4 | sendmsg6 }"
 
 static const char * const attach_type_strings[] = {
 	[BPF_CGROUP_INET_INGRESS] = "ingress",
@@ -34,6 +34,8 @@ static const char * const attach_type_strings[] = {
 	[BPF_CGROUP_INET6_CONNECT] = "connect6",
 	[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
 	[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+	[BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
+	[BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
 };
 
-- 
cgit v1.2.3


From 71b2c87df3ac37f5f83e166db136b0c1d065a781 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 30 May 2018 16:09:16 +0100
Subject: bpf: devmap: remove redundant assignment of dev = dev

The assignment dev = dev is redundant and should be removed.

Detected by CoverityScan, CID#1469486 ("Evaluation order violation")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ae16d0c373ef..1fe3fe60508a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -352,7 +352,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
-	struct net_device *dev = dev = obj ? obj->dev : NULL;
+	struct net_device *dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
-- 
cgit v1.2.3


From bcece5dc40b9e00d0a9a91fc0acc9a825c23362b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 30 May 2018 12:24:17 -0700
Subject: bpf: Change bpf_fib_lookup to return -EAFNOSUPPORT for unsupported
 address families

Update bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address
families. Allows userspace to probe for support as more are added
(e.g., AF_MPLS).

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 81bd2e9fe8fc..885fb0e2f264 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4285,7 +4285,7 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 					   flags, true);
 #endif
 	}
-	return 0;
+	return -EAFNOSUPPORT;
 }
 
 static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
@@ -4302,7 +4302,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
 	struct net *net = dev_net(skb->dev);
-	int index = 0;
+	int index = -EAFNOSUPPORT;
 
 	if (plen < sizeof(*params))
 		return -EINVAL;
-- 
cgit v1.2.3


From 16edddfe3c5d192f097c27ff60cae05daa29f782 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:36 +0900
Subject: selftests/bpf: test_sockmap, check test failure

Test failures are not identified because exit code of RX/TX threads
is not checked. Also threads are not returning correct exit code.

- Return exit code from threads depending on test execution status
- In main thread, check the exit code of RX/TX threads
- Skip error checking for corked tests as they are expected to timeout

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index eb17fae458e6..7b2008a144cb 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -429,8 +429,8 @@ static int sendmsg_test(struct sockmap_options *opt)
 	struct msg_stats s = {0};
 	int iov_count = opt->iov_count;
 	int iov_buf = opt->iov_length;
+	int rx_status, tx_status;
 	int cnt = opt->rate;
-	int status;
 
 	errno = 0;
 
@@ -442,7 +442,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 	rxpid = fork();
 	if (rxpid == 0) {
 		if (opt->drop_expected)
-			exit(1);
+			exit(0);
 
 		if (opt->sendpage)
 			iov_count = 1;
@@ -463,7 +463,9 @@ static int sendmsg_test(struct sockmap_options *opt)
 				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
 				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
+		if (err && txmsg_cork)
+			err = 0;
+		exit(err ? 1 : 0);
 	} else if (rxpid == -1) {
 		perror("msg_loop_rx: ");
 		return errno;
@@ -491,14 +493,27 @@ static int sendmsg_test(struct sockmap_options *opt)
 				"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
 				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
+		exit(err ? 1 : 0);
 	} else if (txpid == -1) {
 		perror("msg_loop_tx: ");
 		return errno;
 	}
 
-	assert(waitpid(rxpid, &status, 0) == rxpid);
-	assert(waitpid(txpid, &status, 0) == txpid);
+	assert(waitpid(rxpid, &rx_status, 0) == rxpid);
+	assert(waitpid(txpid, &tx_status, 0) == txpid);
+	if (WIFEXITED(rx_status)) {
+		err = WEXITSTATUS(rx_status);
+		if (err) {
+			fprintf(stderr, "rx thread exited with err %d. ", err);
+			goto out;
+		}
+	}
+	if (WIFEXITED(tx_status)) {
+		err = WEXITSTATUS(tx_status);
+		if (err)
+			fprintf(stderr, "tx thread exited with err %d. ", err);
+	}
+out:
 	return err;
 }
 
-- 
cgit v1.2.3


From 035b37ff2c6e3577b8ad7d5c3cc4619ab74559bb Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:37 +0900
Subject: selftests/bpf: test_sockmap, join cgroup in selftest mode

In case of selftest mode, temporary cgroup environment is created but
cgroup is not joined. It causes test failures. Fixed by joining the
cgroup

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7b2008a144cb..7f9ca79aadbc 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1344,6 +1344,11 @@ static int __test_suite(char *bpf_file)
 		return cg_fd;
 	}
 
+	if (join_cgroup(CG_PATH)) {
+		fprintf(stderr, "ERROR: failed to join cgroup\n");
+		return -EINVAL;
+	}
+
 	/* Tests basic commands and APIs with range of iov values */
 	txmsg_start = txmsg_end = 0;
 	err = test_txmsg(cg_fd);
-- 
cgit v1.2.3


From a009f1f396d04a700971586ce8a142ea606cbb61 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:38 +0900
Subject: selftests/bpf: test_sockmap, timing improvements

Currently 10us delay is too low for many tests to succeed. It needs to
be increased. Also, many corked tests are expected to hit rx timeout
irrespective of timeout value.

- This patch sets 1000usec timeout value for corked tests because less
than that causes broken-pipe error in tx thread. Also sets 1 second
timeout for all other tests because less than that results in RX
timeout
- tests with apply=1 and higher number of iterations were taking lot
of time. This patch reduces test run time by reducing iterations.

real    0m12.968s
user    0m0.219s
sys     0m14.337s

Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7f9ca79aadbc..5cd0550af595 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -345,8 +345,13 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		if (err < 0)
 			perror("recv start time: ");
 		while (s->bytes_recvd < total_bytes) {
-			timeout.tv_sec = 0;
-			timeout.tv_usec = 10;
+			if (txmsg_cork) {
+				timeout.tv_sec = 0;
+				timeout.tv_usec = 1000;
+			} else {
+				timeout.tv_sec = 1;
+				timeout.tv_usec = 0;
+			}
 
 			/* FD sets */
 			FD_ZERO(&w);
@@ -1025,14 +1030,14 @@ static int test_send(struct sockmap_options *opt, int cgrp)
 
 	opt->iov_length = 1;
 	opt->iov_count = 1;
-	opt->rate = 1024;
+	opt->rate = 512;
 	err = test_exec(cgrp, opt);
 	if (err)
 		goto out;
 
 	opt->iov_length = 256;
 	opt->iov_count = 1024;
-	opt->rate = 10;
+	opt->rate = 2;
 	err = test_exec(cgrp, opt);
 	if (err)
 		goto out;
-- 
cgit v1.2.3


From d825e12f084650649b61209d0cd514c0e57bb0de Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:39 +0900
Subject: selftests/bpf: test_sockmap, fix data verification

When data verification is enabled, some tests fail because verification is done
incorrectly. Following changes fix it.

- Identify the size of data block to be verified
- Reset verification counter when data block size is reached
- Fixed the value printed in case of verfication failure

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 5cd0550af595..2bc70e1ab2eb 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -337,8 +337,15 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
 		float total_bytes;
+		int bytes_cnt = 0;
+		int chunk_sz;
 		fd_set w;
 
+		if (opt->sendpage)
+			chunk_sz = iov_length * cnt;
+		else
+			chunk_sz = iov_length * iov_count;
+
 		fcntl(fd, fd_flags);
 		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -393,9 +400,14 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 							errno = -EIO;
 							fprintf(stderr,
 								"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
-								i, j, d[j], k - 1, d[j+1], k + 1);
+								i, j, d[j], k - 1, d[j+1], k);
 							goto out_errno;
 						}
+						bytes_cnt++;
+						if (bytes_cnt == chunk_sz) {
+							k = 0;
+							bytes_cnt = 0;
+						}
 						recv--;
 					}
 				}
-- 
cgit v1.2.3


From 73563aa3d9773bc70a2f171643cce18617a0cc6f Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:40 +0900
Subject: selftests/bpf: test_sockmap, print additional test options

Print values of test options like apply, cork, start, end so that
individual failed tests can be identified for manual run

Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 2bc70e1ab2eb..05c8cb71724a 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -876,6 +876,8 @@ static char *test_to_str(int test)
 #define OPTSTRING 60
 static void test_options(char *options)
 {
+	char tstr[OPTSTRING];
+
 	memset(options, 0, OPTSTRING);
 
 	if (txmsg_pass)
@@ -888,14 +890,22 @@ static void test_options(char *options)
 		strncat(options, "redir_noisy,", OPTSTRING);
 	if (txmsg_drop)
 		strncat(options, "drop,", OPTSTRING);
-	if (txmsg_apply)
-		strncat(options, "apply,", OPTSTRING);
-	if (txmsg_cork)
-		strncat(options, "cork,", OPTSTRING);
-	if (txmsg_start)
-		strncat(options, "start,", OPTSTRING);
-	if (txmsg_end)
-		strncat(options, "end,", OPTSTRING);
+	if (txmsg_apply) {
+		snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply);
+		strncat(options, tstr, OPTSTRING);
+	}
+	if (txmsg_cork) {
+		snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork);
+		strncat(options, tstr, OPTSTRING);
+	}
+	if (txmsg_start) {
+		snprintf(tstr, OPTSTRING, "start %d,", txmsg_start);
+		strncat(options, tstr, OPTSTRING);
+	}
+	if (txmsg_end) {
+		snprintf(tstr, OPTSTRING, "end %d,", txmsg_end);
+		strncat(options, tstr, OPTSTRING);
+	}
 	if (txmsg_ingress)
 		strncat(options, "ingress,", OPTSTRING);
 	if (txmsg_skb)
@@ -904,7 +914,7 @@ static void test_options(char *options)
 
 static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
 {
-	char *options = calloc(60, sizeof(char));
+	char *options = calloc(OPTSTRING, sizeof(char));
 	int err;
 
 	if (test == SENDPAGE)
-- 
cgit v1.2.3


From b9308ae696b2c35e862636eec631d95ff958c33d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 2 Jun 2018 09:06:50 -0700
Subject: bpf: btf: Check array t->size

This patch ensures array's t->size is 0.

The array size is decided by its individual elem's size and the
number of elements.  Hence, t->size is not used and
it must be 0.

A test case is added to test_btf.c

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c                       |  5 +++++
 tools/testing/selftests/bpf/test_btf.c | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3d20aa1f4b54..84ad532f2854 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1342,6 +1342,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (t->size) {
+		btf_verifier_log_type(env, t, "size != 0");
+		return -EINVAL;
+	}
+
 	/* Array elem type and index type cannot be in type void,
 	 * so !array->type and !array->index_type are not allowed.
 	 */
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 35064df688c1..fd8246e84149 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -1178,6 +1178,29 @@ static struct btf_raw_test raw_tests[] = {
 	.err_str = "Invalid index",
 },
 
+{
+	.descr = "array test. t->size != 0\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int[16] */				/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 1),
+		BTF_ARRAY_ENC(1, 1, 16),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "size != 0",
+},
+
 {
 	.descr = "int test. invalid int_data",
 	.raw_types = {
-- 
cgit v1.2.3


From 8175383f2320dbc1b4e803d857ed499ed3e76199 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 2 Jun 2018 09:06:51 -0700
Subject: bpf: btf: Ensure t->type == 0 for BTF_KIND_FWD

The t->type in BTF_KIND_FWD is not used.  It must be 0.
This patch ensures that and also adds a test case in test_btf.c

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c                       | 21 ++++++++++++++++++++-
 tools/testing/selftests/bpf/test_btf.c | 22 ++++++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 84ad532f2854..8653ab004c73 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1286,8 +1286,27 @@ static struct btf_kind_operations ptr_ops = {
 	.seq_show = btf_ptr_seq_show,
 };
 
+static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 meta_left)
+{
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	if (t->type) {
+		btf_verifier_log_type(env, t, "type != 0");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
 static struct btf_kind_operations fwd_ops = {
-	.check_meta = btf_ref_type_check_meta,
+	.check_meta = btf_fwd_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
 	.log_details = btf_ref_type_log,
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index fd8246e84149..3619f3023088 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -1242,6 +1242,28 @@ static struct btf_raw_test raw_tests[] = {
 	.err_str = "Invalid btf_info",
 },
 
+{
+	.descr = "fwd test. t->type != 0\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* fwd type */				/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 1),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "fwd_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "type != 0",
+},
+
 }; /* struct btf_raw_test raw_tests[] */
 
 static const char *get_next_str(const char *start, const char *end)
-- 
cgit v1.2.3


From 06be0864c77ae6861632a678a6378511a4828d6e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:31 +0200
Subject: bpf: test case for map pointer poison with calls/branches

Add several test cases where the same or different map pointers
originate from different paths in the program and execute a map
lookup or tail call at a common location.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h                      |  10 ++
 tools/include/linux/filter.h                |  10 ++
 tools/testing/selftests/bpf/test_verifier.c | 185 ++++++++++++++++++++++++----
 3 files changed, 178 insertions(+), 27 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d90abdaea94b..6fd375fe7079 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -289,6 +289,16 @@ struct xdp_buff;
 		.off   = OFF,					\
 		.imm   = 0 })
 
+/* Relative call */
+
+#define BPF_CALL_REL(TGT)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_CALL,			\
+		.dst_reg = 0,					\
+		.src_reg = BPF_PSEUDO_CALL,			\
+		.off   = 0,					\
+		.imm   = TGT })
+
 /* Function call */
 
 #define BPF_EMIT_CALL(FUNC)					\
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index c5e512da8d8a..af55acf73e75 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -263,6 +263,16 @@
 #define BPF_LD_MAP_FD(DST, MAP_FD)				\
 	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
 
+/* Relative call */
+
+#define BPF_CALL_REL(TGT)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_CALL,			\
+		.dst_reg = 0,					\
+		.src_reg = BPF_PSEUDO_CALL,			\
+		.off   = 0,					\
+		.imm   = TGT })
+
 /* Program exit */
 
 #define BPF_EXIT_INSN()						\
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 4b4f015be217..7cb1d74057ce 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -50,7 +50,7 @@
 
 #define MAX_INSNS	BPF_MAXINSNS
 #define MAX_FIXUPS	8
-#define MAX_NR_MAPS	4
+#define MAX_NR_MAPS	7
 #define POINTER_VALUE	0xcafe4all
 #define TEST_DATA_LEN	64
 
@@ -66,7 +66,9 @@ struct bpf_test {
 	int fixup_map1[MAX_FIXUPS];
 	int fixup_map2[MAX_FIXUPS];
 	int fixup_map3[MAX_FIXUPS];
-	int fixup_prog[MAX_FIXUPS];
+	int fixup_map4[MAX_FIXUPS];
+	int fixup_prog1[MAX_FIXUPS];
+	int fixup_prog2[MAX_FIXUPS];
 	int fixup_map_in_map[MAX_FIXUPS];
 	const char *errstr;
 	const char *errstr_unpriv;
@@ -2769,7 +2771,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.errstr_unpriv = "R3 leaks addr into helper",
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
@@ -2856,7 +2858,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 42,
 	},
@@ -2870,7 +2872,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 41,
 	},
@@ -2884,7 +2886,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 1,
 	},
@@ -2898,7 +2900,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 2),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 2,
 	},
@@ -2912,7 +2914,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 2),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 2,
 	},
@@ -2926,7 +2928,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 2),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 2 },
+		.fixup_prog1 = { 2 },
 		.result = ACCEPT,
 		.retval = 42,
 	},
@@ -11681,6 +11683,112 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"calls: two calls returning different map pointers for lookup (hash, array)",
+		.insns = {
+			/* main prog */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+			BPF_CALL_REL(11),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_CALL_REL(12),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0,
+				   offsetof(struct test_val, foo)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			/* subprog 1 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* subprog 2 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.fixup_map2 = { 13 },
+		.fixup_map4 = { 16 },
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"calls: two calls returning different map pointers for lookup (hash, map in map)",
+		.insns = {
+			/* main prog */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+			BPF_CALL_REL(11),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_CALL_REL(12),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0,
+				   offsetof(struct test_val, foo)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			/* subprog 1 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* subprog 2 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.fixup_map_in_map = { 16 },
+		.fixup_map4 = { 13 },
+		.result = REJECT,
+		.errstr = "R0 invalid mem access 'map_ptr'",
+	},
+	{
+		"cond: two branches returning different map pointers for lookup (tail, tail)",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 3),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog1 = { 5 },
+		.fixup_prog2 = { 2 },
+		.result_unpriv = REJECT,
+		.errstr_unpriv = "tail_call abusing map_ptr",
+		.result = ACCEPT,
+		.retval = 42,
+	},
+	{
+		"cond: two branches returning same map pointers for lookup (tail, tail)",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 3),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog2 = { 2, 5 },
+		.result_unpriv = ACCEPT,
+		.result = ACCEPT,
+		.retval = 42,
+	},
 	{
 		"search pruning: all branches should be verified (nop operation)",
 		.insns = {
@@ -12162,12 +12270,13 @@ static int probe_filter_length(const struct bpf_insn *fp)
 	return len + 1;
 }
 
-static int create_map(uint32_t size_value, uint32_t max_elem)
+static int create_map(uint32_t type, uint32_t size_key,
+		      uint32_t size_value, uint32_t max_elem)
 {
 	int fd;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
-			    size_value, max_elem, BPF_F_NO_PREALLOC);
+	fd = bpf_create_map(type, size_key, size_value, max_elem,
+			    type == BPF_MAP_TYPE_HASH ? BPF_F_NO_PREALLOC : 0);
 	if (fd < 0)
 		printf("Failed to create hash map '%s'!\n", strerror(errno));
 
@@ -12200,13 +12309,13 @@ static int create_prog_dummy2(int mfd, int idx)
 				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_array(void)
+static int create_prog_array(uint32_t max_elem, int p1key)
 {
-	int p1key = 0, p2key = 1;
+	int p2key = 1;
 	int mfd, p1fd, p2fd;
 
 	mfd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, sizeof(int),
-			     sizeof(int), 4, 0);
+			     sizeof(int), max_elem, 0);
 	if (mfd < 0) {
 		printf("Failed to create prog array '%s'!\n", strerror(errno));
 		return -1;
@@ -12261,7 +12370,9 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	int *fixup_map1 = test->fixup_map1;
 	int *fixup_map2 = test->fixup_map2;
 	int *fixup_map3 = test->fixup_map3;
-	int *fixup_prog = test->fixup_prog;
+	int *fixup_map4 = test->fixup_map4;
+	int *fixup_prog1 = test->fixup_prog1;
+	int *fixup_prog2 = test->fixup_prog2;
 	int *fixup_map_in_map = test->fixup_map_in_map;
 
 	if (test->fill_helper)
@@ -12272,7 +12383,8 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	 * that really matters is value size in this case.
 	 */
 	if (*fixup_map1) {
-		map_fds[0] = create_map(sizeof(long long), 1);
+		map_fds[0] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(long long), 1);
 		do {
 			prog[*fixup_map1].imm = map_fds[0];
 			fixup_map1++;
@@ -12280,7 +12392,8 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	}
 
 	if (*fixup_map2) {
-		map_fds[1] = create_map(sizeof(struct test_val), 1);
+		map_fds[1] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(struct test_val), 1);
 		do {
 			prog[*fixup_map2].imm = map_fds[1];
 			fixup_map2++;
@@ -12288,25 +12401,43 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	}
 
 	if (*fixup_map3) {
-		map_fds[1] = create_map(sizeof(struct other_val), 1);
+		map_fds[2] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(struct other_val), 1);
 		do {
-			prog[*fixup_map3].imm = map_fds[1];
+			prog[*fixup_map3].imm = map_fds[2];
 			fixup_map3++;
 		} while (*fixup_map3);
 	}
 
-	if (*fixup_prog) {
-		map_fds[2] = create_prog_array();
+	if (*fixup_map4) {
+		map_fds[3] = create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+					sizeof(struct test_val), 1);
+		do {
+			prog[*fixup_map4].imm = map_fds[3];
+			fixup_map4++;
+		} while (*fixup_map4);
+	}
+
+	if (*fixup_prog1) {
+		map_fds[4] = create_prog_array(4, 0);
+		do {
+			prog[*fixup_prog1].imm = map_fds[4];
+			fixup_prog1++;
+		} while (*fixup_prog1);
+	}
+
+	if (*fixup_prog2) {
+		map_fds[5] = create_prog_array(8, 7);
 		do {
-			prog[*fixup_prog].imm = map_fds[2];
-			fixup_prog++;
-		} while (*fixup_prog);
+			prog[*fixup_prog2].imm = map_fds[5];
+			fixup_prog2++;
+		} while (*fixup_prog2);
 	}
 
 	if (*fixup_map_in_map) {
-		map_fds[3] = create_map_in_map();
+		map_fds[6] = create_map_in_map();
 		do {
-			prog[*fixup_map_in_map].imm = map_fds[3];
+			prog[*fixup_map_in_map].imm = map_fds[6];
 			fixup_map_in_map++;
 		} while (*fixup_map_in_map);
 	}
-- 
cgit v1.2.3


From be08815c5d3b25e53cd9b53a4d768d5f3d93ba25 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:32 +0200
Subject: bpf: add also cbpf long jump test cases with heavy expansion

We have one triggering on eBPF but lets also add a cBPF example to
make sure we keep tracking them. Also add anther cBPF test running
max number of MSH ops.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 lib/test_bpf.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 317f231462d4..60aedc879361 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -356,6 +356,52 @@ static int bpf_fill_maxinsns11(struct bpf_test *self)
 	return __bpf_fill_ja(self, BPF_MAXINSNS, 68);
 }
 
+static int bpf_fill_maxinsns12(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i = 0;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	insn[0] = __BPF_JUMP(BPF_JMP | BPF_JA, len - 2, 0, 0);
+
+	for (i = 1; i < len - 1; i++)
+		insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
+
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_K, 0xabababab);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns13(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i = 0;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	for (i = 0; i < len - 3; i++)
+		insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
+
+	insn[len - 3] = __BPF_STMT(BPF_LD | BPF_IMM, 0xabababab);
+	insn[len - 2] = __BPF_STMT(BPF_ALU | BPF_XOR | BPF_X, 0);
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
 static int bpf_fill_ja(struct bpf_test *self)
 {
 	/* Hits exactly 11 passes on x86_64 JIT. */
@@ -5289,6 +5335,23 @@ static struct bpf_test tests[] = {
 		.fill_helper = bpf_fill_maxinsns11,
 		.expected_errcode = -ENOTSUPP,
 	},
+	{
+		"BPF_MAXINSNS: jump over MSH",
+		{ },
+		CLASSIC | FLAG_EXPECTED_FAIL,
+		{ 0xfa, 0xfb, 0xfc, 0xfd, },
+		{ { 4, 0xabababab } },
+		.fill_helper = bpf_fill_maxinsns12,
+		.expected_errcode = -EINVAL,
+	},
+	{
+		"BPF_MAXINSNS: exec all MSH",
+		{ },
+		CLASSIC,
+		{ 0xfa, 0xfb, 0xfc, 0xfd, },
+		{ { 4, 0xababab83 } },
+		.fill_helper = bpf_fill_maxinsns13,
+	},
 	{
 		"BPF_MAXINSNS: ld_abs+get_processor_id",
 		{ },
-- 
cgit v1.2.3


From 3fe2867cdf088ffb2dc5aed6cdcf757b4c62476c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:33 +0200
Subject: bpf: fixup error message from gpl helpers on license mismatch

Stating 'proprietary program' in the error is just silly since it
can also be a different open source license than that which is just
not compatible.

Reference: https://twitter.com/majek04/status/998531268039102465
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1fd9667b29f1..4f4786ea2296 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2462,7 +2462,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
 	if (!env->prog->gpl_compatible && fn->gpl_only) {
-		verbose(env, "cannot call GPL only function from proprietary program\n");
+		verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 4316b40914ecde3738968225af56e650e8b61938 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:34 +0200
Subject: bpf: show prog and map id in fdinfo

Its trivial and straight forward to expose it for scripts that can
then use it along with bpftool in order to inspect an individual
application's used maps and progs. Right now we dump some basic
information in the fdinfo file but with the help of the map/prog
id full introspection becomes possible now.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7365d79ae00d..0fa20624707f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -327,13 +327,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "value_size:\t%u\n"
 		   "max_entries:\t%u\n"
 		   "map_flags:\t%#x\n"
-		   "memlock:\t%llu\n",
+		   "memlock:\t%llu\n"
+		   "map_id:\t%u\n",
 		   map->map_type,
 		   map->key_size,
 		   map->value_size,
 		   map->max_entries,
 		   map->map_flags,
-		   map->pages * 1ULL << PAGE_SHIFT);
+		   map->pages * 1ULL << PAGE_SHIFT,
+		   map->id);
 
 	if (owner_prog_type) {
 		seq_printf(m, "owner_prog_type:\t%u\n",
@@ -1070,11 +1072,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "prog_type:\t%u\n"
 		   "prog_jited:\t%u\n"
 		   "prog_tag:\t%s\n"
-		   "memlock:\t%llu\n",
+		   "memlock:\t%llu\n"
+		   "prog_id:\t%u\n",
 		   prog->type,
 		   prog->jited,
 		   prog_tag,
-		   prog->pages * 1ULL << PAGE_SHIFT);
+		   prog->pages * 1ULL << PAGE_SHIFT,
+		   prog->aux->id);
 }
 #endif
 
-- 
cgit v1.2.3


From 09772d92cd5ad998b0d5f6f46cd1658f8cb698cf Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:35 +0200
Subject: bpf: avoid retpoline for lookup/update/delete calls on maps

While some of the BPF map lookup helpers provide a ->map_gen_lookup()
callback for inlining the map lookup altogether it is not available
for every map, so the remaining ones have to call bpf_map_lookup_elem()
helper which does a dispatch to map->ops->map_lookup_elem(). In
times of retpolines, this will control and trap speculative execution
rather than letting it do its work for the indirect call and will
therefore cause a slowdown. Likewise, bpf_map_update_elem() and
bpf_map_delete_elem() do not have an inlined version and need to call
into their map->ops->map_update_elem() resp. map->ops->map_delete_elem()
handlers.

Before:

  # bpftool prog dump xlated id 1
    0: (bf) r2 = r10
    1: (07) r2 += -8
    2: (7a) *(u64 *)(r2 +0) = 0
    3: (18) r1 = map[id:1]
    5: (85) call __htab_map_lookup_elem#232656
    6: (15) if r0 == 0x0 goto pc+4
    7: (71) r1 = *(u8 *)(r0 +35)
    8: (55) if r1 != 0x0 goto pc+1
    9: (72) *(u8 *)(r0 +35) = 1
   10: (07) r0 += 56
   11: (15) if r0 == 0x0 goto pc+4
   12: (bf) r2 = r0
   13: (18) r1 = map[id:1]
   15: (85) call bpf_map_delete_elem#215008  <-- indirect call via
   16: (95) exit                                 helper

After:

  # bpftool prog dump xlated id 1
    0: (bf) r2 = r10
    1: (07) r2 += -8
    2: (7a) *(u64 *)(r2 +0) = 0
    3: (18) r1 = map[id:1]
    5: (85) call __htab_map_lookup_elem#233328
    6: (15) if r0 == 0x0 goto pc+4
    7: (71) r1 = *(u8 *)(r0 +35)
    8: (55) if r1 != 0x0 goto pc+1
    9: (72) *(u8 *)(r0 +35) = 1
   10: (07) r0 += 56
   11: (15) if r0 == 0x0 goto pc+4
   12: (bf) r2 = r0
   13: (18) r1 = map[id:1]
   15: (85) call htab_lru_map_delete_elem#238240  <-- direct call
   16: (95) exit

In all three lookup/update/delete cases however we can use the actual
address of the map callback directly if we find that there's only a
single path with a map pointer leading to the helper call, meaning
when the map pointer has not been poisoned from verifier side.
Example code can be seen above for the delete case.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/hashtab.c   | 12 ++++++---
 kernel/bpf/verifier.c  | 68 ++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6fd375fe7079..8e60f1e9a702 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -301,6 +301,9 @@ struct xdp_buff;
 
 /* Function call */
 
+#define BPF_CAST_CALL(x)					\
+		((u64 (*)(u64, u64, u64, u64, u64))(x))
+
 #define BPF_EMIT_CALL(FUNC)					\
 	((struct bpf_insn) {					\
 		.code  = BPF_JMP | BPF_CALL,			\
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b76828f23b49..3ca2198a6d22 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 				offsetof(struct htab_elem, key) +
@@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
 	const int ret = BPF_REG_0;
 	const int ref_reg = BPF_REG_1;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
 	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
 			      offsetof(struct htab_elem, lru_node) +
@@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 				offsetof(struct htab_elem, key) +
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4f4786ea2296..1dd6d5a7aa5b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2421,8 +2421,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
 
 	if (func_id != BPF_FUNC_tail_call &&
-	    func_id != BPF_FUNC_map_lookup_elem)
+	    func_id != BPF_FUNC_map_lookup_elem &&
+	    func_id != BPF_FUNC_map_update_elem &&
+	    func_id != BPF_FUNC_map_delete_elem)
 		return 0;
+
 	if (meta->map_ptr == NULL) {
 		verbose(env, "kernel subsystem misconfigured verifier\n");
 		return -EINVAL;
@@ -5586,6 +5589,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
+	const struct bpf_map_ops *ops;
 	struct bpf_insn_aux_data *aux;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
@@ -5715,35 +5719,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		}
 
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
-		 * handlers are currently limited to 64 bit only.
+		 * and other inlining handlers are currently limited to 64 bit
+		 * only.
 		 */
 		if (prog->jit_requested && BITS_PER_LONG == 64 &&
-		    insn->imm == BPF_FUNC_map_lookup_elem) {
+		    (insn->imm == BPF_FUNC_map_lookup_elem ||
+		     insn->imm == BPF_FUNC_map_update_elem ||
+		     insn->imm == BPF_FUNC_map_delete_elem)) {
 			aux = &env->insn_aux_data[i + delta];
 			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
 
 			map_ptr = BPF_MAP_PTR(aux->map_state);
-			if (!map_ptr->ops->map_gen_lookup)
-				goto patch_call_imm;
+			ops = map_ptr->ops;
+			if (insn->imm == BPF_FUNC_map_lookup_elem &&
+			    ops->map_gen_lookup) {
+				cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+				if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+					verbose(env, "bpf verifier is misconfigured\n");
+					return -EINVAL;
+				}
 
-			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
-			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-				verbose(env, "bpf verifier is misconfigured\n");
-				return -EINVAL;
-			}
+				new_prog = bpf_patch_insn_data(env, i + delta,
+							       insn_buf, cnt);
+				if (!new_prog)
+					return -ENOMEM;
 
-			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
-						       cnt);
-			if (!new_prog)
-				return -ENOMEM;
+				delta    += cnt - 1;
+				env->prog = prog = new_prog;
+				insn      = new_prog->insnsi + i + delta;
+				continue;
+			}
 
-			delta += cnt - 1;
+			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+				     (void *(*)(struct bpf_map *map, void *key))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+				     (int (*)(struct bpf_map *map, void *key))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+				     (int (*)(struct bpf_map *map, void *key, void *value,
+					      u64 flags))NULL));
+			switch (insn->imm) {
+			case BPF_FUNC_map_lookup_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_update_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_delete_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
+					    __bpf_call_base;
+				continue;
+			}
 
-			/* keep walking new program and skip insns we just inserted */
-			env->prog = prog = new_prog;
-			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto patch_call_imm;
 		}
 
 		if (insn->imm == BPF_FUNC_redirect_map) {
-- 
cgit v1.2.3


From cb20b08ead401fd17627a36f035c0bf5bfee5567 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:36 +0200
Subject: bpf: add bpf_skb_cgroup_id helper

Add a new bpf_skb_cgroup_id() helper that allows to retrieve the
cgroup id from the skb's socket. This is useful in particular to
enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in
cgroup v2 by allowing ID based matching on egress. This can in
particular be used in combination with applying policy e.g. from
map lookups, and also complements the older bpf_skb_under_cgroup()
interface. In user space the cgroup id for a given path can be
retrieved through the f_handle as demonstrated in [0] recently.

  [0] https://lkml.org/lkml/2018/5/22/1190

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 19 ++++++++++++++++++-
 net/core/filter.c        | 29 +++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 64ac0f7a689e..661318141f72 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2054,6 +2054,22 @@ union bpf_attr {
  *
  *	Return
  *		0
+ *
+ * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * 	Description
+ * 		Return the cgroup v2 id of the socket associated with the *skb*.
+ * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 		helper for cgroup v1 by providing a tag resp. identifier that
+ * 		can be matched on or used for map lookups e.g. to implement
+ * 		policy. The cgroup v2 id of a given path in the hierarchy is
+ * 		exposed in user space through the f_handle API in order to get
+ * 		to the same 64-bit id.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress,
+ * 		and is available only if the kernel was compiled with the
+ * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2134,7 +2150,8 @@ union bpf_attr {
 	FN(lwt_seg6_adjust_srh),	\
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
-	FN(rc_keydown),
+	FN(rc_keydown),			\
+	FN(skb_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 885fb0e2f264..edbfaa613b6d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3661,6 +3661,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+#ifdef CONFIG_SOCK_CGROUP_DATA
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+	struct cgroup *cgrp;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	return cgrp->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
+	.func           = bpf_skb_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+};
+#endif
+
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 				  unsigned long off, unsigned long len)
 {
@@ -4747,12 +4768,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 #ifdef CONFIG_XFRM
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
-	case BPF_FUNC_fib_lookup:
-		return &bpf_skb_fib_lookup_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	case BPF_FUNC_skb_cgroup_id:
+		return &bpf_skb_cgroup_id_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 1fbc2e0cfcf9677683aea2b1f9ea76fbdc6fb6d1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:37 +0200
Subject: bpf: make sure to clear unused fields in tunnel/xfrm state fetch

Since the remaining bits are not filled in struct bpf_tunnel_key
resp. struct bpf_xfrm_state and originate from uninitialized stack
space, we should make sure to clear them before handing control
back to the program.

Also add a padding element to struct bpf_xfrm_state for future use
similar as we have in struct bpf_tunnel_key and clear it as well.

  struct bpf_xfrm_state {
      __u32                      reqid;            /*     0     4 */
      __u32                      spi;              /*     4     4 */
      __u16                      family;           /*     8     2 */

      /* XXX 2 bytes hole, try to pack */

      union {
          __u32              remote_ipv4;          /*           4 */
          __u32              remote_ipv6[4];       /*          16 */
      };                                           /*    12    16 */

      /* size: 28, cachelines: 1, members: 4 */
      /* sum members: 26, holes: 1, sum holes: 2 */
      /* last cacheline: 28 bytes */
  };

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 3 ++-
 net/core/filter.c        | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 661318141f72..f0b6608b1f1c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2268,7 +2268,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
-	__u16 tunnel_ext;
+	__u16 tunnel_ext;	/* Padding, future use. */
 	__u32 tunnel_label;
 };
 
@@ -2279,6 +2279,7 @@ struct bpf_xfrm_state {
 	__u32 reqid;
 	__u32 spi;	/* Stored in network byte order */
 	__u16 family;
+	__u16 ext;	/* Padding, future use. */
 	union {
 		__u32 remote_ipv4;	/* Stored in network byte order */
 		__u32 remote_ipv6[4];	/* Stored in network byte order */
diff --git a/net/core/filter.c b/net/core/filter.c
index edbfaa613b6d..28e864777c0f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3445,6 +3445,7 @@ set_compat:
 	to->tunnel_id = be64_to_cpu(info->key.tun_id);
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
+	to->tunnel_ext = 0;
 
 	if (flags & BPF_F_TUNINFO_IPV6) {
 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -3452,6 +3453,8 @@ set_compat:
 		to->tunnel_label = be32_to_cpu(info->key.label);
 	} else {
 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+		to->tunnel_label = 0;
 	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
@@ -4047,11 +4050,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
 	to->reqid = x->props.reqid;
 	to->spi = x->id.spi;
 	to->family = x->props.family;
+	to->ext = 0;
+
 	if (to->family == AF_INET6) {
 		memcpy(to->remote_ipv6, x->props.saddr.a6,
 		       sizeof(to->remote_ipv6));
 	} else {
 		to->remote_ipv4 = x->props.saddr.a4;
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From b3bbba3570a7ba064f12c33f484eb9757a2b359e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:38 +0200
Subject: bpf: fix cbpf parser bug for octal numbers

Range is 0-7, not 0-9, otherwise parser silently excludes it from the
strtol() rather than throwing an error.

Reported-by: Marc Boschma <marc@boschma.cx>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/bpf_exp.l | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/bpf/bpf_exp.l b/tools/bpf/bpf_exp.l
index bd83149e7be0..4da8d053d68f 100644
--- a/tools/bpf/bpf_exp.l
+++ b/tools/bpf/bpf_exp.l
@@ -175,7 +175,7 @@ extern void yyerror(const char *str);
 			yylval.number = strtol(yytext, NULL, 10);
 			return number;
 		}
-([0][0-9]+)	{
+([0][0-7]+)	{
 			yylval.number = strtol(yytext + 1, NULL, 8);
 			return number;
 		}
-- 
cgit v1.2.3


From bc23105ca0abdeed98366af01c700c2c3aff5cd5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:39 +0200
Subject: bpf: fix context access in tracing progs on 32 bit archs

Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT
program type in test_verifier report the following errors on x86_32:

  172/p unpriv: spill/fill of different pointers ldx FAIL
  Unexpected error message!
  0: (bf) r6 = r10
  1: (07) r6 += -8
  2: (15) if r1 == 0x0 goto pc+3
  R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1
  3: (bf) r2 = r10
  4: (07) r2 += -76
  5: (7b) *(u64 *)(r6 +0) = r2
  6: (55) if r1 != 0x0 goto pc+1
  R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp
  7: (7b) *(u64 *)(r6 +0) = r1
  8: (79) r1 = *(u64 *)(r6 +0)
  9: (79) r1 = *(u64 *)(r1 +68)
  invalid bpf_context access off=68 size=8

  378/p check bpf_perf_event_data->sample_period byte load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (71) r0 = *(u8 *)(r1 +68)
  invalid bpf_context access off=68 size=1

  379/p check bpf_perf_event_data->sample_period half load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (69) r0 = *(u16 *)(r1 +68)
  invalid bpf_context access off=68 size=2

  380/p check bpf_perf_event_data->sample_period word load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (61) r0 = *(u32 *)(r1 +68)
  invalid bpf_context access off=68 size=4

  381/p check bpf_perf_event_data->sample_period dword load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (79) r0 = *(u64 *)(r1 +68)
  invalid bpf_context access off=68 size=8

Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte
boundary due to its size of 68 bytes. Therefore, bpf_ctx_narrow_access_ok()
will then bail out saying that off & (size_default - 1) which is 68 & 7
doesn't cleanly align in the case of sample_period access from struct
bpf_perf_event_data, hence verifier wrongly thinks we might be doing an
unaligned access here though underlying arch can handle it just fine.
Therefore adjust this down to machine size and check and rewrite the
offset for narrow access on that basis. We also need to fix corresponding
pe_prog_is_valid_access(), since we hit the check for off % size != 0
(e.g. 68 % 8 -> 4) in the first and last test. With that in place, progs
for tracing work on x86_32.

Reported-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h   | 30 ++++++++++++++++++++++++------
 kernel/bpf/verifier.c    |  3 ++-
 kernel/trace/bpf_trace.c | 10 ++++++++--
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8e60f1e9a702..45fc0f5000d8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -639,16 +639,34 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
 	return prog->type == BPF_PROG_TYPE_UNSPEC;
 }
 
-static inline bool
-bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default)
+static inline u32 bpf_ctx_off_adjust_machine(u32 size)
+{
+	const u32 size_machine = sizeof(unsigned long);
+
+	if (size > size_machine && size % size_machine == 0)
+		size = size_machine;
+
+	return size;
+}
+
+static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access,
+					   u32 size_default)
 {
-	bool off_ok;
+	size_default = bpf_ctx_off_adjust_machine(size_default);
+	size_access  = bpf_ctx_off_adjust_machine(size_access);
+
 #ifdef __LITTLE_ENDIAN
-	off_ok = (off & (size_default - 1)) == 0;
+	return (off & (size_default - 1)) == 0;
 #else
-	off_ok = (off & (size_default - 1)) + size == size_default;
+	return (off & (size_default - 1)) + size_access == size_default;
 #endif
-	return off_ok && size <= size_default && (size & (size - 1)) == 0;
+}
+
+static inline bool
+bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
+{
+	return bpf_ctx_narrow_align_ok(off, size, size_default) &&
+	       size <= size_default && (size & (size - 1)) == 0;
 }
 
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1dd6d5a7aa5b..d6403b5166f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5349,6 +5349,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		 */
 		is_narrower_load = size < ctx_field_size;
 		if (is_narrower_load) {
+			u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
 			u32 off = insn->off;
 			u8 size_code;
 
@@ -5363,7 +5364,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			else if (ctx_field_size == 8)
 				size_code = BPF_DW;
 
-			insn->off = off & ~(ctx_field_size - 1);
+			insn->off = off & ~(size_default - 1);
 			insn->code = BPF_LDX | BPF_MEM | size_code;
 		}
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index af1486d9a0ed..752992ce3513 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -880,8 +880,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 		return false;
 	if (type != BPF_READ)
 		return false;
-	if (off % size != 0)
-		return false;
+	if (off % size != 0) {
+		if (sizeof(unsigned long) != 4)
+			return false;
+		if (size != 8)
+			return false;
+		if (off % size != 4)
+			return false;
+	}
 
 	switch (off) {
 	case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
-- 
cgit v1.2.3


From 6b6a192595a9f04b2196620ee7b6027e567789ac Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:40 +0200
Subject: bpf: sync bpf uapi header with tools

Pull in recent changes from include/uapi/linux/bpf.h.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 64ac0f7a689e..f0b6608b1f1c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2054,6 +2054,22 @@ union bpf_attr {
  *
  *	Return
  *		0
+ *
+ * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * 	Description
+ * 		Return the cgroup v2 id of the socket associated with the *skb*.
+ * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 		helper for cgroup v1 by providing a tag resp. identifier that
+ * 		can be matched on or used for map lookups e.g. to implement
+ * 		policy. The cgroup v2 id of a given path in the hierarchy is
+ * 		exposed in user space through the f_handle API in order to get
+ * 		to the same 64-bit id.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress,
+ * 		and is available only if the kernel was compiled with the
+ * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2134,7 +2150,8 @@ union bpf_attr {
 	FN(lwt_seg6_adjust_srh),	\
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
-	FN(rc_keydown),
+	FN(rc_keydown),			\
+	FN(skb_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2251,7 +2268,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
-	__u16 tunnel_ext;
+	__u16 tunnel_ext;	/* Padding, future use. */
 	__u32 tunnel_label;
 };
 
@@ -2262,6 +2279,7 @@ struct bpf_xfrm_state {
 	__u32 reqid;
 	__u32 spi;	/* Stored in network byte order */
 	__u16 family;
+	__u16 ext;	/* Padding, future use. */
 	union {
 		__u32 remote_ipv4;	/* Stored in network byte order */
 		__u32 remote_ipv6[4];	/* Stored in network byte order */
-- 
cgit v1.2.3


From 10a76564ae865cbf30ed30e8cbdc1a047e0559ae Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:41 +0200
Subject: bpf, doc: add missing patchwork url and libbpf to maintainers

Add missing bits under tools/lib/bpf/ and also Q: entry in order to
make it easier for people to retrieve current patch queue.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f492431b239b..2fd51db09fa4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2722,6 +2722,7 @@ L:	netdev@vger.kernel.org
 L:	linux-kernel@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
+Q:	https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
 S:	Supported
 F:	arch/x86/net/bpf_jit*
 F:	Documentation/networking/filter.txt
@@ -2740,6 +2741,7 @@ F:	net/sched/act_bpf.c
 F:	net/sched/cls_bpf.c
 F:	samples/bpf/
 F:	tools/bpf/
+F:	tools/lib/bpf/
 F:	tools/testing/selftests/bpf/
 
 BROADCOM B44 10/100 ETHERNET DRIVER
-- 
cgit v1.2.3


From 42b33468987bac0dd95c30f14820c7abac04a153 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 10:59:47 +0200
Subject: xdp: add flags argument to ndo_xdp_xmit API

This patch only change the API and reject any use of flags. This is an
intermediate step that allows us to implement the flush flag operation
later, for each individual driver in a separate patch.

The plan is to implement flush operation via XDP_XMIT_FLUSH flag
and then remove XDP_XMIT_FLAGS_NONE when done.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 6 +++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   | 3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 ++++-
 drivers/net/tun.c                             | 8 ++++++--
 drivers/net/virtio_net.c                      | 5 ++++-
 include/linux/netdevice.h                     | 7 ++++---
 include/net/xdp.h                             | 5 +++++
 kernel/bpf/devmap.c                           | 2 +-
 net/core/filter.c                             | 2 +-
 9 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 9b698c5acd05..c0451d6e0790 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3670,7 +3670,8 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * For error cases, a negative errno code is returned and no-frames
  * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		  u32 flags)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
@@ -3684,6 +3685,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	for (i = 0; i < n; i++) {
 		struct xdp_frame *xdpf = frames[i];
 		int err;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index eb8804b3d7b6..820f76db251b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,8 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		  u32 flags);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 031d65c4178d..87f088f4af52 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10023,7 +10023,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 }
 
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
-			  struct xdp_frame **frames)
+			  struct xdp_frame **frames, u32 flags)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
@@ -10033,6 +10033,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	/* During program transitions its possible adapter->xdp_prog is assigned
 	 * but ring has not been configured yet. In this case simply abort xmit.
 	 */
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2265d2ccea47..b182b8cdd219 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,7 +1285,8 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+static int tun_xdp_xmit(struct net_device *dev, int n,
+			struct xdp_frame **frames, u32 flags)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_file *tfile;
@@ -1294,6 +1295,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames
 	int cnt = n;
 	int i;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
@@ -1332,7 +1336,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, 1, &frame);
+	return tun_xdp_xmit(dev, 1, &frame, 0);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b2647dd5d302..4ed823625953 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -468,7 +468,7 @@ static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
 }
 
 static int virtnet_xdp_xmit(struct net_device *dev,
-			    int n, struct xdp_frame **frames)
+			    int n, struct xdp_frame **frames, u32 flags)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
@@ -481,6 +481,9 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	int err;
 	int i;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 	sq = &vi->sq[qp];
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8452f72087ef..7f17785a59d7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1185,13 +1185,13 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
+ *			u32 flags);
  *	This function is used to submit @n XDP packets for transmit on a
  *	netdevice. Returns number of frames successfully transmitted, frames
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
- *	TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *	This function is used to inform the driver to flush a particular
  *	xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1380,7 +1380,8 @@ struct net_device_ops {
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
-						struct xdp_frame **xdp);
+						struct xdp_frame **xdp,
+						u32 flags);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 7ad779237ae8..0c45f0f943ed 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -40,6 +40,11 @@ enum xdp_mem_type {
 	MEM_TYPE_MAX,
 };
 
+/* XDP flags for ndo_xdp_xmit */
+#define XDP_XMIT_FLAGS_NONE	0U
+#define XDP_XMIT_FLUSH		(1U << 0)	/* doorbell signal consumer */
+#define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
+
 struct xdp_mem_info {
 	u32 type; /* enum xdp_mem_type, but known size type */
 	u32 id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1fe3fe60508a..037e234056f7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
diff --git a/net/core/filter.c b/net/core/filter.c
index 28e864777c0f..56e40dafdde7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3056,7 +3056,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
 	if (sent <= 0)
 		return sent;
 	dev->netdev_ops->ndo_xdp_flush(dev);
-- 
cgit v1.2.3


From cdb57ed07fafb79a250e62d714b8910f2d341ef2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 10:59:52 +0200
Subject: i40e: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag i40e_xdp_xmit now performs the
same kind of ring tail update as in i40e_xdp_flush.  The advantage is
that all the necessary checks have been performed and xdp_ring can be
updated, instead of having to perform the exact same steps/checks in
i40e_xdp_flush

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c0451d6e0790..5f01e4ce9c92 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3676,6 +3676,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
 	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_ring *xdp_ring;
 	int drops = 0;
 	int i;
 
@@ -3685,20 +3686,25 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
+	xdp_ring = vsi->xdp_rings[queue_index];
+
 	for (i = 0; i < n; i++) {
 		struct xdp_frame *xdpf = frames[i];
 		int err;
 
-		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+		err = i40e_xmit_xdp_ring(xdpf, xdp_ring);
 		if (err != I40E_XDP_TX) {
 			xdp_return_frame_rx_napi(xdpf);
 			drops++;
 		}
 	}
 
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		i40e_xdp_ring_update_tail(xdp_ring);
+
 	return n - drops;
 }
 
-- 
cgit v1.2.3


From 5e2e609522d67750d4b3c0321d1917bb109ac41b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 10:59:57 +0200
Subject: ixgbe: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag ixgbe_xdp_xmit now performs the
same kind of ring tail update as in ixgbe_xdp_flush.  The update tail
code in ixgbe_xdp_flush is generalized and shared with ixgbe_xdp_xmit.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 87f088f4af52..4fd77c9067f2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10022,6 +10022,15 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static void ixgbe_xdp_ring_update_tail(struct ixgbe_ring *ring)
+{
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.
+	 */
+	wmb();
+	writel(ring->next_to_use, ring->tail);
+}
+
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 			  struct xdp_frame **frames, u32 flags)
 {
@@ -10033,7 +10042,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	/* During program transitions its possible adapter->xdp_prog is assigned
@@ -10054,6 +10063,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 		}
 	}
 
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		ixgbe_xdp_ring_update_tail(ring);
+
 	return n - drops;
 }
 
@@ -10072,11 +10084,7 @@ static void ixgbe_xdp_flush(struct net_device *dev)
 	if (unlikely(!ring))
 		return;
 
-	/* Force memory writes to complete before letting h/w know there
-	 * are new descriptors to fetch.
-	 */
-	wmb();
-	writel(ring->next_to_use, ring->tail);
+	ixgbe_xdp_ring_update_tail(ring);
 
 	return;
 }
-- 
cgit v1.2.3


From 0c9d917b7d74b758af1b4e37996af48d95b2ef33 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:03 +0200
Subject: tun: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag tun_xdp_xmit now performs the same
kind of socket wake up as in tun_xdp_flush(). The wake up code from
tun_xdp_flush is generalized and shared with tun_xdp_xmit.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/tun.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b182b8cdd219..d82a05fb0594 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,6 +1285,14 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
+static void __tun_xdp_flush_tfile(struct tun_file *tfile)
+{
+	/* Notify and wake up reader process */
+	if (tfile->flags & TUN_FASYNC)
+		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+}
+
 static int tun_xdp_xmit(struct net_device *dev, int n,
 			struct xdp_frame **frames, u32 flags)
 {
@@ -1295,7 +1303,7 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
 	int cnt = n;
 	int i;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	rcu_read_lock();
@@ -1325,6 +1333,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
 	}
 	spin_unlock(&tfile->tx_ring.producer_lock);
 
+	if (flags & XDP_XMIT_FLUSH)
+		__tun_xdp_flush_tfile(tfile);
+
 	rcu_read_unlock();
 	return cnt - drops;
 }
@@ -1353,11 +1364,7 @@ static void tun_xdp_flush(struct net_device *dev)
 
 	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
 					    numqueues]);
-	/* Notify and wake up reader process */
-	if (tfile->flags & TUN_FASYNC)
-		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
-	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
-
+	__tun_xdp_flush_tfile(tfile);
 out:
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3


From 5d274cb4bbae481161737a1f652e276883d3d970 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:08 +0200
Subject: virtio_net: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag virtnet_xdp_xmit now performs the
same virtqueue_kick as virtnet_xdp_flush.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/virtio_net.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4ed823625953..62ba8aadd8e6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -481,7 +481,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	int err;
 	int i;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
@@ -507,6 +507,10 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 			drops++;
 		}
 	}
+
+	if (flags & XDP_XMIT_FLUSH)
+		virtqueue_kick(sq->vq);
+
 	return n - drops;
 }
 
-- 
cgit v1.2.3


From 73de5717eb30ceedef6abf9da4bc2194f25d92ca Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:13 +0200
Subject: xdp: done implementing ndo_xdp_xmit flush flag for all drivers

Removing XDP_XMIT_FLAGS_NONE as all driver now implement
a flush operation in their ndo_xdp_xmit call.  The compiler
will catch if any users of XDP_XMIT_FLAGS_NONE remains.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0c45f0f943ed..a3b71a4dd71d 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -41,7 +41,6 @@ enum xdp_mem_type {
 };
 
 /* XDP flags for ndo_xdp_xmit */
-#define XDP_XMIT_FLAGS_NONE	0U
 #define XDP_XMIT_FLUSH		(1U << 0)	/* doorbell signal consumer */
 #define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
 
-- 
cgit v1.2.3


From 1e67575a5840908e33502b210a22509fe5d6ca53 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:18 +0200
Subject: bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush

This is the first real user of the XDP_XMIT_FLUSH flag.

As pointed out many times, XDP_REDIRECT without using BPF maps is
significant slower than the map variant.  This is primary due to the
lack of bulking, as the ndo_xdp_flush operation is required after each
frame (to avoid frames hanging on the egress device).

It is still possible to optimize this case.  Instead of invoking two
NDO indirect calls, which are very expensive with CONFIG_RETPOLINE,
instead instruct ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 56e40dafdde7..a72ea9f61010 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3056,10 +3056,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
 	if (sent <= 0)
 		return sent;
-	dev->netdev_ops->ndo_xdp_flush(dev);
 	return 0;
 }
 
-- 
cgit v1.2.3


From c1ece6b245bd12a57124da78abafbf8a511394d6 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:23 +0200
Subject: bpf/xdp: devmap can avoid calling ndo_xdp_flush

The XDP_REDIRECT map devmap can avoid using ndo_xdp_flush, by instead
instructing ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag in
appropriate places.

Notice after this patch it is possible to remove ndo_xdp_flush
completely, as this is the last user of ndo_xdp_flush. This is left
for later patches, to keep driver changes separate.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 037e234056f7..a7cc7b3494a9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -217,7 +217,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 }
 
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
-			 struct xdp_bulk_queue *bq)
+		       struct xdp_bulk_queue *bq, u32 flags)
 {
 	struct net_device *dev = obj->dev;
 	int sent = 0, drops = 0, err = 0;
@@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
@@ -276,7 +276,6 @@ void __dev_map_flush(struct bpf_map *map)
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
 		struct xdp_bulk_queue *bq;
-		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
 		 * between xdp redirect and flush op.
@@ -287,10 +286,7 @@ void __dev_map_flush(struct bpf_map *map)
 		__clear_bit(bit, bitmap);
 
 		bq = this_cpu_ptr(dev->bulkq);
-		bq_xmit_all(dev, bq);
-		netdev = dev->dev;
-		if (likely(netdev->netdev_ops->ndo_xdp_flush))
-			netdev->netdev_ops->ndo_xdp_flush(netdev);
+		bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
 	}
 }
 
@@ -320,7 +316,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-		bq_xmit_all(obj, bq);
+		bq_xmit_all(obj, bq, 0);
 
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
@@ -359,8 +355,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
-	if (dev->dev->netdev_ops->ndo_xdp_flush) {
-		struct net_device *fl = dev->dev;
+	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
 		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
 
@@ -371,9 +366,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 			__clear_bit(dev->bit, bitmap);
 
 			bq = per_cpu_ptr(dev->bulkq, cpu);
-			bq_xmit_all(dev, bq);
-
-			fl->netdev_ops->ndo_xdp_flush(dev->dev);
+			bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
 		}
 	}
 }
-- 
cgit v1.2.3


From bf6fa2c893c5237b48569a13fa3c673041430b6c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 3 Jun 2018 15:59:41 -0700
Subject: bpf: implement bpf_get_current_cgroup_id() helper

bpf has been used extensively for tracing. For example, bcc
contains an almost full set of bpf-based tools to trace kernel
and user functions/events. Most tracing tools are currently
either filtered based on pid or system-wide.

Containers have been used quite extensively in industry and
cgroup is often used together to provide resource isolation
and protection. Several processes may run inside the same
container. It is often desirable to get container-level tracing
results as well, e.g. syscall count, function count, I/O
activity, etc.

This patch implements a new helper, bpf_get_current_cgroup_id(),
which will return cgroup id based on the cgroup within which
the current task is running.

The later patch will provide an example to show that
userspace can get the same cgroup id so it could
configure a filter or policy in the bpf program based on
task cgroup id.

The helper is currently implemented for tracing. It can
be added to other program types as well when needed.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  8 +++++++-
 kernel/bpf/core.c        |  1 +
 kernel/bpf/helpers.c     | 15 +++++++++++++++
 kernel/trace/bpf_trace.c |  2 ++
 5 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbe297436e5d..995c3b1e59bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
+extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f0b6608b1f1c..18712b0dbfe7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2070,6 +2070,11 @@ union bpf_attr {
  * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * 	Return
+ * 		A 64-bit integer containing the current cgroup id based
+ * 		on the cgroup within which the current task is running.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2151,7 +2156,8 @@ union bpf_attr {
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
-	FN(skb_cgroup_id),
+	FN(skb_cgroup_id),		\
+	FN(get_current_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 527587de8a67..9f1493705f40 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1765,6 +1765,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
 const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..73065e2d23c2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE,
 };
+
+#ifdef CONFIG_CGROUPS
+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+	struct cgroup *cgrp = task_dfl_cgroup(current);
+
+	return cgrp->kn->id.id;
+}
+
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+	.func		= bpf_get_current_cgroup_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+#endif
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 752992ce3513..e2ab5b7f29d2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -564,6 +564,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_probe_read_str:
 		return &bpf_probe_read_str_proto;
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From c7ddbbaf1e73e2f4e95cfda7b1da1be7fd199a4d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 3 Jun 2018 15:59:42 -0700
Subject: tools/bpf: sync uapi bpf.h for bpf_get_current_cgroup_id() helper

Sync kernel uapi/linux/bpf.h with tools uapi/linux/bpf.h.
Also add the necessary helper define in bpf_helpers.h.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h            | 8 +++++++-
 tools/testing/selftests/bpf/bpf_helpers.h | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f0b6608b1f1c..18712b0dbfe7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2070,6 +2070,11 @@ union bpf_attr {
  * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * 	Return
+ * 		A 64-bit integer containing the current cgroup id based
+ * 		on the cgroup within which the current task is running.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2151,7 +2156,8 @@ union bpf_attr {
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
-	FN(skb_cgroup_id),
+	FN(skb_cgroup_id),		\
+	FN(get_current_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index a66a9d91acf4..f2f28b6c8915 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -131,6 +131,8 @@ static int (*bpf_rc_repeat)(void *ctx) =
 static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol,
 			     unsigned long long scancode, unsigned int toggle) =
 	(void *) BPF_FUNC_rc_keydown;
+static unsigned long long (*bpf_get_current_cgroup_id)(void) =
+	(void *) BPF_FUNC_get_current_cgroup_id;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3


From f269099a7e7a0c6732c4a817d0e99e92216414d9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 3 Jun 2018 15:59:43 -0700
Subject: tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper

Syscall name_to_handle_at() can be used to get cgroup id
for a particular cgroup path in user space. The selftest
got cgroup id from both user and kernel, and compare to
ensure they are equal to each other.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore           |   1 +
 tools/testing/selftests/bpf/Makefile             |   6 +-
 tools/testing/selftests/bpf/cgroup_helpers.c     |  57 +++++++++
 tools/testing/selftests/bpf/cgroup_helpers.h     |   1 +
 tools/testing/selftests/bpf/get_cgroup_id_kern.c |  28 +++++
 tools/testing/selftests/bpf/get_cgroup_id_user.c | 141 +++++++++++++++++++++++
 6 files changed, 232 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/get_cgroup_id_kern.c
 create mode 100644 tools/testing/selftests/bpf/get_cgroup_id_user.c

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 6ea835982464..49938d72cf63 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -18,3 +18,4 @@ urandom_read
 test_btf
 test_sockmap
 test_lirc_mode2_user
+get_cgroup_id_user
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 553d1816b77a..607ed8729c06 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_btf test_sockmap test_lirc_mode2_user
+	test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -34,7 +34,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
-	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o
+	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
+	get_cgroup_id_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -63,6 +64,7 @@ $(OUTPUT)/test_sock: cgroup_helpers.c
 $(OUTPUT)/test_sock_addr: cgroup_helpers.c
 $(OUTPUT)/test_sockmap: cgroup_helpers.c
 $(OUTPUT)/test_progs: trace_helpers.c
+$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
index f3bca3ade0f3..c87b4e052ce9 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.c
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -6,6 +6,7 @@
 #include <sys/types.h>
 #include <linux/limits.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <linux/sched.h>
 #include <fcntl.h>
 #include <unistd.h>
@@ -176,3 +177,59 @@ int create_and_get_cgroup(char *path)
 
 	return fd;
 }
+
+/**
+ * get_cgroup_id() - Get cgroup id for a particular cgroup path
+ * @path: The cgroup path, relative to the workdir, to join
+ *
+ * On success, it returns the cgroup id. On failure it returns 0,
+ * which is an invalid cgroup id.
+ * If there is a failure, it prints the error to stderr.
+ */
+unsigned long long get_cgroup_id(char *path)
+{
+	int dirfd, err, flags, mount_id, fhsize;
+	union {
+		unsigned long long cgid;
+		unsigned char raw_bytes[8];
+	} id;
+	char cgroup_workdir[PATH_MAX + 1];
+	struct file_handle *fhp, *fhp2;
+	unsigned long long ret = 0;
+
+	format_cgroup_path(cgroup_workdir, path);
+
+	dirfd = AT_FDCWD;
+	flags = 0;
+	fhsize = sizeof(*fhp);
+	fhp = calloc(1, fhsize);
+	if (!fhp) {
+		log_err("calloc");
+		return 0;
+	}
+	err = name_to_handle_at(dirfd, cgroup_workdir, fhp, &mount_id, flags);
+	if (err >= 0 || fhp->handle_bytes != 8) {
+		log_err("name_to_handle_at");
+		goto free_mem;
+	}
+
+	fhsize = sizeof(struct file_handle) + fhp->handle_bytes;
+	fhp2 = realloc(fhp, fhsize);
+	if (!fhp2) {
+		log_err("realloc");
+		goto free_mem;
+	}
+	err = name_to_handle_at(dirfd, cgroup_workdir, fhp2, &mount_id, flags);
+	fhp = fhp2;
+	if (err < 0) {
+		log_err("name_to_handle_at");
+		goto free_mem;
+	}
+
+	memcpy(id.raw_bytes, fhp->f_handle, 8);
+	ret = id.cgid;
+
+free_mem:
+	free(fhp);
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h
index 06485e0002b3..20a4a5dcd469 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.h
+++ b/tools/testing/selftests/bpf/cgroup_helpers.h
@@ -13,5 +13,6 @@ int create_and_get_cgroup(char *path);
 int join_cgroup(char *path);
 int setup_cgroup_environment(void);
 void cleanup_cgroup_environment(void);
+unsigned long long get_cgroup_id(char *path);
 
 #endif
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
new file mode 100644
index 000000000000..2cf8cb23f209
--- /dev/null
+++ b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") cg_ids = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64),
+	.max_entries = 1,
+};
+
+SEC("tracepoint/syscalls/sys_enter_nanosleep")
+int trace(void *ctx)
+{
+	__u32 key = 0;
+	__u64 *val;
+
+	val = bpf_map_lookup_elem(&cg_ids, &key);
+	if (val)
+		*val = bpf_get_current_cgroup_id();
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c
new file mode 100644
index 000000000000..ea19a42e5894
--- /dev/null
+++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+
+#define CHECK(condition, tag, format...) ({		\
+	int __ret = !!(condition);			\
+	if (__ret) {					\
+		printf("%s:FAIL:%s ", __func__, tag);	\
+		printf(format);				\
+	} else {					\
+		printf("%s:PASS:%s\n", __func__, tag);	\
+	}						\
+	__ret;						\
+})
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map)
+		return -1;
+	return bpf_map__fd(map);
+}
+
+#define TEST_CGROUP "/test-bpf-get-cgroup-id/"
+
+int main(int argc, char **argv)
+{
+	const char *probe_name = "syscalls/sys_enter_nanosleep";
+	const char *file = "get_cgroup_id_kern.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	struct perf_event_attr attr = {};
+	int cgroup_fd, cgidmap_fd;
+	struct bpf_object *obj;
+	__u64 kcgid = 0, ucgid;
+	int exit_code = 1;
+	char buf[256];
+	__u32 key = 0;
+
+	err = setup_cgroup_environment();
+	if (CHECK(err, "setup_cgroup_environment", "err %d errno %d\n", err,
+		  errno))
+		return 1;
+
+	cgroup_fd = create_and_get_cgroup(TEST_CGROUP);
+	if (CHECK(cgroup_fd < 0, "create_and_get_cgroup", "err %d errno %d\n",
+		  cgroup_fd, errno))
+		goto cleanup_cgroup_env;
+
+	err = join_cgroup(TEST_CGROUP);
+	if (CHECK(err, "join_cgroup", "err %d errno %d\n", err, errno))
+		goto cleanup_cgroup_env;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto cleanup_cgroup_env;
+
+	cgidmap_fd = bpf_find_map(__func__, obj, "cg_ids");
+	if (CHECK(cgidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  cgidmap_fd, errno))
+		goto close_prog;
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	/* attach to this pid so the all bpf invocations will be in the
+	 * cgroup associated with this pid.
+	 */
+	pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* trigger some syscalls */
+	sleep(1);
+
+	err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid);
+	if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	ucgid = get_cgroup_id(TEST_CGROUP);
+	if (CHECK(kcgid != ucgid, "compare_cgroup_id",
+		  "kern cgid %llx user cgid %llx", kcgid, ucgid))
+		goto close_pmu;
+
+	exit_code = 0;
+	printf("%s:PASS\n", argv[0]);
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	bpf_object__close(obj);
+cleanup_cgroup_env:
+	cleanup_cgroup_environment();
+	return exit_code;
+}
-- 
cgit v1.2.3


From bd3a08aaa9a383ffbbd5b788b797ae6e64eaa7a1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 3 Jun 2018 08:15:19 -0700
Subject: bpf: flowlabel in bpf_fib_lookup should be flowinfo

As Michal noted the flow struct takes both the flow label and priority.
Update the bpf_fib_lookup API to note that it is flowinfo and not just
the flow label.

Cc: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h   | 2 +-
 net/core/filter.c          | 2 +-
 samples/bpf/xdp_fwd_kern.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 18712b0dbfe7..eeb6237be5c2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2629,7 +2629,7 @@ struct bpf_fib_lookup {
 	union {
 		/* inputs to lookup */
 		__u8	tos;		/* AF_INET  */
-		__be32	flowlabel;	/* AF_INET6 */
+		__be32	flowinfo;	/* AF_INET6, flow_label + priority */
 
 		/* output: metric of fib result (IPv4/IPv6 only) */
 		__u32	rt_metric;
diff --git a/net/core/filter.c b/net/core/filter.c
index a72ea9f61010..3d9ba7e5965a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4221,7 +4221,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		fl6.flowi6_oif = 0;
 		strict = RT6_LOOKUP_F_HAS_SADDR;
 	}
-	fl6.flowlabel = params->flowlabel;
+	fl6.flowlabel = params->flowinfo;
 	fl6.flowi6_scope = 0;
 	fl6.flowi6_flags = 0;
 	fl6.mp_hash = 0;
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 4a6be0f87505..6673cdb9f55c 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -88,7 +88,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 			return XDP_PASS;
 
 		fib_params.family	= AF_INET6;
-		fib_params.flowlabel	= *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
+		fib_params.flowinfo	= *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
 		fib_params.l4_protocol	= ip6h->nexthdr;
 		fib_params.sport	= 0;
 		fib_params.dport	= 0;
-- 
cgit v1.2.3


From 4e64c835254095f55044d393e628dd3e92fca304 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:11 +0200
Subject: xsk: proper fill queue descriptor validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the fill queue descriptor was not copied to kernel space
prior validating it, making it possible for userland to change the
descriptor post-kernel-validation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c       | 11 +++++------
 net/xdp/xsk_queue.h | 32 +++++++++-----------------------
 2 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cce0e4f8a536..43554eb56fe6 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,20 +41,19 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	u32 *id, len = xdp->data_end - xdp->data;
+	u32 id, len = xdp->data_end - xdp->data;
 	void *buffer;
-	int err = 0;
+	int err;
 
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	id = xskq_peek_id(xs->umem->fq);
-	if (!id)
+	if (!xskq_peek_id(xs->umem->fq, &id))
 		return -ENOSPC;
 
-	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
+	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
 	memcpy(buffer, xdp->data, len);
-	err = xskq_produce_batch_desc(xs->rx, *id, len,
+	err = xskq_produce_batch_desc(xs->rx, id, len,
 				      xs->umem->frame_headroom);
 	if (!err)
 		xskq_discard_id(xs->umem->fq);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index cb8e5be35110..b5924e7aeb2b 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -85,14 +85,15 @@ static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
 	return true;
 }
 
-static inline u32 *xskq_validate_id(struct xsk_queue *q)
+static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		if (xskq_is_valid_id(q, ring->desc[idx]))
-			return &ring->desc[idx];
+		*id = READ_ONCE(ring->desc[idx]);
+		if (xskq_is_valid_id(q, *id))
+			return id;
 
 		q->cons_tail++;
 	}
@@ -100,28 +101,22 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q)
 	return NULL;
 }
 
-static inline u32 *xskq_peek_id(struct xsk_queue *q)
+static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
 {
-	struct xdp_umem_ring *ring;
-
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
 		/* Order consumer and data */
 		smp_rmb();
-
-		return xskq_validate_id(q);
 	}
 
-	ring = (struct xdp_umem_ring *)q->ring;
-	return &ring->desc[q->cons_tail & q->ring_mask];
+	return xskq_validate_id(q, id);
 }
 
 static inline void xskq_discard_id(struct xsk_queue *q)
 {
 	q->cons_tail++;
-	(void)xskq_validate_id(q);
 }
 
 static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
@@ -174,11 +169,9 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		if (xskq_is_valid_desc(q, &ring->desc[idx])) {
-			if (desc)
-				*desc = ring->desc[idx];
+		*desc = READ_ONCE(ring->desc[idx]);
+		if (xskq_is_valid_desc(q, desc))
 			return desc;
-		}
 
 		q->cons_tail++;
 	}
@@ -189,27 +182,20 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
 					      struct xdp_desc *desc)
 {
-	struct xdp_rxtx_ring *ring;
-
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
 		/* Order consumer and data */
 		smp_rmb();
-
-		return xskq_validate_desc(q, desc);
 	}
 
-	ring = (struct xdp_rxtx_ring *)q->ring;
-	*desc = ring->desc[q->cons_tail & q->ring_mask];
-	return desc;
+	return xskq_validate_desc(q, desc);
 }
 
 static inline void xskq_discard_desc(struct xsk_queue *q)
 {
 	q->cons_tail++;
-	(void)xskq_validate_desc(q, NULL);
 }
 
 static inline int xskq_produce_batch_desc(struct xsk_queue *q,
-- 
cgit v1.2.3


From a509a95536a86ef84deb16c656d741437791b414 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:12 +0200
Subject: xsk: proper Rx drop statistics update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, rx_dropped could be updated incorrectly, e.g. if the XDP
program redirected the frame to a socket bound to a different queue
than where the XDP program was executing.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 43554eb56fe6..966307ce4b8e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -48,8 +48,10 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	if (!xskq_peek_id(xs->umem->fq, &id))
+	if (!xskq_peek_id(xs->umem->fq, &id)) {
+		xs->rx_dropped++;
 		return -ENOSPC;
+	}
 
 	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
 	memcpy(buffer, xdp->data, len);
@@ -57,6 +59,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 				      xs->umem->frame_headroom);
 	if (!err)
 		xskq_discard_id(xs->umem->fq);
+	else
+		xs->rx_dropped++;
 
 	return err;
 }
@@ -68,8 +72,6 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	err = __xsk_rcv(xs, xdp);
 	if (likely(!err))
 		xdp_return_buff(xdp);
-	else
-		xs->rx_dropped++;
 
 	return err;
 }
@@ -87,8 +89,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	err = __xsk_rcv(xs, xdp);
 	if (!err)
 		xsk_flush(xs);
-	else
-		xs->rx_dropped++;
 
 	return err;
 }
-- 
cgit v1.2.3


From bbff2f321a864ee07c9d3d1245af498023146951 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:13 +0200
Subject: xsk: new descriptor addressing scheme
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data.  Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).

By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.

In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.

In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.

We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.

To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.

On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.

Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.

This commit changes the uapi of AF_XDP sockets, and updates the
documentation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/af_xdp.rst | 101 +++++++++++++++++++++---------------
 include/uapi/linux/if_xdp.h         |  12 ++---
 net/xdp/xdp_umem.c                  |  33 ++++++------
 net/xdp/xdp_umem.h                  |  27 +++-------
 net/xdp/xdp_umem_props.h            |   4 +-
 net/xdp/xsk.c                       |  30 ++++++-----
 net/xdp/xsk_queue.c                 |   2 +-
 net/xdp/xsk_queue.h                 |  43 +++++++--------
 8 files changed, 123 insertions(+), 129 deletions(-)

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 91928d9ee4bf..ff929cfab4f4 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -12,7 +12,7 @@ packet processing.
 
 This document assumes that the reader is familiar with BPF and XDP. If
 not, the Cilium project has an excellent reference guide at
-http://cilium.readthedocs.io/en/doc-1.0/bpf/.
+http://cilium.readthedocs.io/en/latest/bpf/.
 
 Using the XDP_REDIRECT action from an XDP program, the program can
 redirect ingress frames to other XDP enabled netdevs, using the
@@ -33,22 +33,22 @@ for a while due to a possible retransmit, the descriptor that points
 to that packet can be changed to point to another and reused right
 away. This again avoids copying data.
 
-The UMEM consists of a number of equally size frames and each frame
-has a unique frame id. A descriptor in one of the rings references a
-frame by referencing its frame id. The user space allocates memory for
-this UMEM using whatever means it feels is most appropriate (malloc,
-mmap, huge pages, etc). This memory area is then registered with the
-kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two
-rings: the FILL ring and the COMPLETION ring. The fill ring is used by
-the application to send down frame ids for the kernel to fill in with
-RX packet data. References to these frames will then appear in the RX
-ring once each packet has been received. The completion ring, on the
-other hand, contains frame ids that the kernel has transmitted
-completely and can now be used again by user space, for either TX or
-RX. Thus, the frame ids appearing in the completion ring are ids that
-were previously transmitted using the TX ring. In summary, the RX and
-FILL rings are used for the RX path and the TX and COMPLETION rings
-are used for the TX path.
+The UMEM consists of a number of equally sized chunks. A descriptor in
+one of the rings references a frame by referencing its addr. The addr
+is simply an offset within the entire UMEM region. The user space
+allocates memory for this UMEM using whatever means it feels is most
+appropriate (malloc, mmap, huge pages, etc). This memory area is then
+registered with the kernel using the new setsockopt XDP_UMEM_REG. The
+UMEM also has two rings: the FILL ring and the COMPLETION ring. The
+fill ring is used by the application to send down addr for the kernel
+to fill in with RX packet data. References to these frames will then
+appear in the RX ring once each packet has been received. The
+completion ring, on the other hand, contains frame addr that the
+kernel has transmitted completely and can now be used again by user
+space, for either TX or RX. Thus, the frame addrs appearing in the
+completion ring are addrs that were previously transmitted using the
+TX ring. In summary, the RX and FILL rings are used for the RX path
+and the TX and COMPLETION rings are used for the TX path.
 
 The socket is then finally bound with a bind() call to a device and a
 specific queue id on that device, and it is not until bind is
@@ -59,13 +59,13 @@ wants to do this, it simply skips the registration of the UMEM and its
 corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind
 call and submits the XSK of the process it would like to share UMEM
 with as well as its own newly created XSK socket. The new process will
-then receive frame id references in its own RX ring that point to this
-shared UMEM. Note that since the ring structures are single-consumer /
-single-producer (for performance reasons), the new process has to
-create its own socket with associated RX and TX rings, since it cannot
-share this with the other process. This is also the reason that there
-is only one set of FILL and COMPLETION rings per UMEM. It is the
-responsibility of a single process to handle the UMEM.
+then receive frame addr references in its own RX ring that point to
+this shared UMEM. Note that since the ring structures are
+single-consumer / single-producer (for performance reasons), the new
+process has to create its own socket with associated RX and TX rings,
+since it cannot share this with the other process. This is also the
+reason that there is only one set of FILL and COMPLETION rings per
+UMEM. It is the responsibility of a single process to handle the UMEM.
 
 How is then packets distributed from an XDP program to the XSKs? There
 is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The
@@ -102,10 +102,10 @@ UMEM
 
 UMEM is a region of virtual contiguous memory, divided into
 equal-sized frames. An UMEM is associated to a netdev and a specific
-queue id of that netdev. It is created and configured (frame size,
-frame headroom, start address and size) by using the XDP_UMEM_REG
-setsockopt system call. A UMEM is bound to a netdev and queue id, via
-the bind() system call.
+queue id of that netdev. It is created and configured (chunk size,
+headroom, start address and size) by using the XDP_UMEM_REG setsockopt
+system call. A UMEM is bound to a netdev and queue id, via the bind()
+system call.
 
 An AF_XDP is socket linked to a single UMEM, but one UMEM can have
 multiple AF_XDP sockets. To share an UMEM created via one socket A,
@@ -147,13 +147,17 @@ UMEM Fill Ring
 ~~~~~~~~~~~~~~
 
 The Fill ring is used to transfer ownership of UMEM frames from
-user-space to kernel-space. The UMEM indicies are passed in the
-ring. As an example, if the UMEM is 64k and each frame is 4k, then the
-UMEM has 16 frames and can pass indicies between 0 and 15.
+user-space to kernel-space. The UMEM addrs are passed in the ring. As
+an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
+16 chunks and can pass addrs between 0 and 64k.
 
 Frames passed to the kernel are used for the ingress path (RX rings).
 
-The user application produces UMEM indicies to this ring.
+The user application produces UMEM addrs to this ring. Note that the
+kernel will mask the incoming addr. E.g. for a chunk size of 2k, the
+log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
+and 3000 refers to the same chunk.
+
 
 UMEM Completetion Ring
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -165,16 +169,15 @@ used.
 Frames passed from the kernel to user-space are frames that has been
 sent (TX ring) and can be used by user-space again.
 
-The user application consumes UMEM indicies from this ring.
+The user application consumes UMEM addrs from this ring.
 
 
 RX Ring
 ~~~~~~~
 
 The RX ring is the receiving side of a socket. Each entry in the ring
-is a struct xdp_desc descriptor. The descriptor contains UMEM index
-(idx), the length of the data (len), the offset into the frame
-(offset).
+is a struct xdp_desc descriptor. The descriptor contains UMEM offset
+(addr) and the length of the data (len).
 
 If no frames have been passed to kernel via the Fill ring, no
 descriptors will (or can) appear on the RX ring.
@@ -221,38 +224,50 @@ side is xdpsock_user.c and the XDP side xdpsock_kern.c.
 
 Naive ring dequeue and enqueue could look like this::
 
+    // struct xdp_rxtx_ring {
+    // 	__u32 *producer;
+    // 	__u32 *consumer;
+    // 	struct xdp_desc *desc;
+    // };
+
+    // struct xdp_umem_ring {
+    // 	__u32 *producer;
+    // 	__u32 *consumer;
+    // 	__u64 *desc;
+    // };
+
     // typedef struct xdp_rxtx_ring RING;
     // typedef struct xdp_umem_ring RING;
 
     // typedef struct xdp_desc RING_TYPE;
-    // typedef __u32 RING_TYPE;
+    // typedef __u64 RING_TYPE;
 
     int dequeue_one(RING *ring, RING_TYPE *item)
     {
-        __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;
+        __u32 entries = *ring->producer - *ring->consumer;
 
         if (entries == 0)
             return -1;
 
         // read-barrier!
 
-        *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
-        ring->ptrs.consumer++;
+        *item = ring->desc[*ring->consumer & (RING_SIZE - 1)];
+        (*ring->consumer)++;
         return 0;
     }
 
     int enqueue_one(RING *ring, const RING_TYPE *item)
     {
-        u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);
+        u32 free_entries = RING_SIZE - (*ring->producer - *ring->consumer);
 
         if (free_entries == 0)
             return -1;
 
-        ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;
+        ring->desc[*ring->producer & (RING_SIZE - 1)] = *item;
 
         // write-barrier!
 
-        ring->ptrs.producer++;
+        (*ring->producer)++;
         return 0;
     }
 
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 4737cfe222f5..e411d6f9ac65 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -48,8 +48,8 @@ struct xdp_mmap_offsets {
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
 	__u64 len; /* Length of packet data area */
-	__u32 frame_size; /* Frame size */
-	__u32 frame_headroom; /* Frame head room */
+	__u32 chunk_size;
+	__u32 headroom;
 };
 
 struct xdp_statistics {
@@ -66,13 +66,11 @@ struct xdp_statistics {
 
 /* Rx/Tx descriptor */
 struct xdp_desc {
-	__u32 idx;
+	__u64 addr;
 	__u32 len;
-	__u16 offset;
-	__u8 flags;
-	__u8 padding[5];
+	__u32 options;
 };
 
-/* UMEM descriptor is __u32 */
+/* UMEM descriptor is __u64 */
 
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 87998818116f..9ad791ff4739 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -14,7 +14,7 @@
 
 #include "xdp_umem.h"
 
-#define XDP_UMEM_MIN_FRAME_SIZE 2048
+#define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
@@ -151,12 +151,12 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
 
 static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 {
-	u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
+	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+	unsigned int chunks, chunks_per_page;
 	u64 addr = mr->addr, size = mr->len;
-	unsigned int nframes, nfpp;
 	int size_chk, err;
 
-	if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
 		/* Strictly speaking we could support this, if:
 		 * - huge pages, or*
 		 * - using an IOMMU, or
@@ -166,7 +166,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 		return -EINVAL;
 	}
 
-	if (!is_power_of_2(frame_size))
+	if (!is_power_of_2(chunk_size))
 		return -EINVAL;
 
 	if (!PAGE_ALIGNED(addr)) {
@@ -179,33 +179,30 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if ((addr + size) < addr)
 		return -EINVAL;
 
-	nframes = (unsigned int)div_u64(size, frame_size);
-	if (nframes == 0 || nframes > UINT_MAX)
+	chunks = (unsigned int)div_u64(size, chunk_size);
+	if (chunks == 0)
 		return -EINVAL;
 
-	nfpp = PAGE_SIZE / frame_size;
-	if (nframes < nfpp || nframes % nfpp)
+	chunks_per_page = PAGE_SIZE / chunk_size;
+	if (chunks < chunks_per_page || chunks % chunks_per_page)
 		return -EINVAL;
 
-	frame_headroom = ALIGN(frame_headroom, 64);
+	headroom = ALIGN(headroom, 64);
 
-	size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
+	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
 	if (size_chk < 0)
 		return -EINVAL;
 
 	umem->pid = get_task_pid(current, PIDTYPE_PID);
-	umem->size = (size_t)size;
 	umem->address = (unsigned long)addr;
-	umem->props.frame_size = frame_size;
-	umem->props.nframes = nframes;
-	umem->frame_headroom = frame_headroom;
+	umem->props.chunk_mask = ~((u64)chunk_size - 1);
+	umem->props.size = size;
+	umem->headroom = headroom;
+	umem->chunk_size_nohr = chunk_size - headroom;
 	umem->npgs = size / PAGE_SIZE;
 	umem->pgs = NULL;
 	umem->user = NULL;
 
-	umem->frame_size_log2 = ilog2(frame_size);
-	umem->nfpp_mask = nfpp - 1;
-	umem->nfpplog2 = ilog2(nfpp);
 	refcount_set(&umem->users, 1);
 
 	err = xdp_umem_account_pages(umem);
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 0881cf456230..aeadd1bcb72d 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -18,35 +18,20 @@ struct xdp_umem {
 	struct xsk_queue *cq;
 	struct page **pgs;
 	struct xdp_umem_props props;
-	u32 npgs;
-	u32 frame_headroom;
-	u32 nfpp_mask;
-	u32 nfpplog2;
-	u32 frame_size_log2;
+	u32 headroom;
+	u32 chunk_size_nohr;
 	struct user_struct *user;
 	struct pid *pid;
 	unsigned long address;
-	size_t size;
 	refcount_t users;
 	struct work_struct work;
+	u32 npgs;
 };
 
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx)
-{
-	u64 pg, off;
-	char *data;
-
-	pg = idx >> umem->nfpplog2;
-	off = (idx & umem->nfpp_mask) << umem->frame_size_log2;
-
-	data = page_address(umem->pgs[pg]);
-	return data + off;
-}
-
-static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem,
-						    u32 idx)
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-	return xdp_umem_get_data(umem, idx) + umem->frame_headroom;
+	return page_address(umem->pgs[addr >> PAGE_SHIFT]) +
+		(addr & (PAGE_SIZE - 1));
 }
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
index 2cf8ec485fd2..40eab10dfc49 100644
--- a/net/xdp/xdp_umem_props.h
+++ b/net/xdp/xdp_umem_props.h
@@ -7,8 +7,8 @@
 #define XDP_UMEM_PROPS_H_
 
 struct xdp_umem_props {
-	u32 frame_size;
-	u32 nframes;
+	u64 chunk_mask;
+	u64 size;
 };
 
 #endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 966307ce4b8e..4688c750df1d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,24 +41,27 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	u32 id, len = xdp->data_end - xdp->data;
+	u32 len = xdp->data_end - xdp->data;
 	void *buffer;
+	u64 addr;
 	int err;
 
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	if (!xskq_peek_id(xs->umem->fq, &id)) {
+	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	    len > xs->umem->chunk_size_nohr) {
 		xs->rx_dropped++;
 		return -ENOSPC;
 	}
 
-	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
+	addr += xs->umem->headroom;
+
+	buffer = xdp_umem_get_data(xs->umem, addr);
 	memcpy(buffer, xdp->data, len);
-	err = xskq_produce_batch_desc(xs->rx, id, len,
-				      xs->umem->frame_headroom);
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
 	if (!err)
-		xskq_discard_id(xs->umem->fq);
+		xskq_discard_addr(xs->umem->fq);
 	else
 		xs->rx_dropped++;
 
@@ -95,10 +98,10 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
-	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
+	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 
-	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));
+	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
 
 	sock_wfree(skb);
 }
@@ -123,14 +126,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 
 	while (xskq_peek_desc(xs->tx, &desc)) {
 		char *buffer;
-		u32 id, len;
+		u64 addr;
+		u32 len;
 
 		if (max_batch-- == 0) {
 			err = -EAGAIN;
 			goto out;
 		}
 
-		if (xskq_reserve_id(xs->umem->cq)) {
+		if (xskq_reserve_addr(xs->umem->cq)) {
 			err = -EAGAIN;
 			goto out;
 		}
@@ -153,8 +157,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		}
 
 		skb_put(skb, len);
-		id = desc.idx;
-		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
+		addr = desc.addr;
+		buffer = xdp_umem_get_data(xs->umem, addr);
 		err = skb_store_bits(skb, 0, buffer, len);
 		if (unlikely(err)) {
 			kfree_skb(skb);
@@ -164,7 +168,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		skb->dev = xs->dev;
 		skb->priority = sk->sk_priority;
 		skb->mark = sk->sk_mark;
-		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
+		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
 		skb->destructor = xsk_destruct_skb;
 
 		err = dev_direct_xmit(skb, xs->queue_id);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index ebe85e59507e..6c32e92e98fc 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -17,7 +17,7 @@ void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
 
 static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 {
-	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
+	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64);
 }
 
 static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index b5924e7aeb2b..337e5ad3b10e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -27,7 +27,7 @@ struct xdp_rxtx_ring {
 /* Used for the fill and completion queues for buffers */
 struct xdp_umem_ring {
 	struct xdp_ring ptrs;
-	u32 desc[0] ____cacheline_aligned_in_smp;
+	u64 desc[0] ____cacheline_aligned_in_smp;
 };
 
 struct xsk_queue {
@@ -76,24 +76,25 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 
 /* UMEM queue */
 
-static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
+static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 {
-	if (unlikely(idx >= q->umem_props.nframes)) {
+	if (addr >= q->umem_props.size) {
 		q->invalid_descs++;
 		return false;
 	}
+
 	return true;
 }
 
-static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		*id = READ_ONCE(ring->desc[idx]);
-		if (xskq_is_valid_id(q, *id))
-			return id;
+		*addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
+		if (xskq_is_valid_addr(q, *addr))
+			return addr;
 
 		q->cons_tail++;
 	}
@@ -101,7 +102,7 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
 	return NULL;
 }
 
-static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
 {
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
@@ -111,19 +112,19 @@ static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
 		smp_rmb();
 	}
 
-	return xskq_validate_id(q, id);
+	return xskq_validate_addr(q, addr);
 }
 
-static inline void xskq_discard_id(struct xsk_queue *q)
+static inline void xskq_discard_addr(struct xsk_queue *q)
 {
 	q->cons_tail++;
 }
 
-static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
+static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
-	ring->desc[q->prod_tail++ & q->ring_mask] = id;
+	ring->desc[q->prod_tail++ & q->ring_mask] = addr;
 
 	/* Order producer and data */
 	smp_wmb();
@@ -132,7 +133,7 @@ static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
 	return 0;
 }
 
-static inline int xskq_reserve_id(struct xsk_queue *q)
+static inline int xskq_reserve_addr(struct xsk_queue *q)
 {
 	if (xskq_nb_free(q, q->prod_head, 1) == 0)
 		return -ENOSPC;
@@ -145,16 +146,11 @@ static inline int xskq_reserve_id(struct xsk_queue *q)
 
 static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
 {
-	u32 buff_len;
-
-	if (unlikely(d->idx >= q->umem_props.nframes)) {
-		q->invalid_descs++;
+	if (!xskq_is_valid_addr(q, d->addr))
 		return false;
-	}
 
-	buff_len = q->umem_props.frame_size;
-	if (unlikely(d->len > buff_len || d->len == 0 ||
-		     d->offset > buff_len || d->offset + d->len > buff_len)) {
+	if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
+	    (d->addr & q->umem_props.chunk_mask)) {
 		q->invalid_descs++;
 		return false;
 	}
@@ -199,7 +195,7 @@ static inline void xskq_discard_desc(struct xsk_queue *q)
 }
 
 static inline int xskq_produce_batch_desc(struct xsk_queue *q,
-					  u32 id, u32 len, u16 offset)
+					  u64 addr, u32 len)
 {
 	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 	unsigned int idx;
@@ -208,9 +204,8 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q,
 		return -ENOSPC;
 
 	idx = (q->prod_head++) & q->ring_mask;
-	ring->desc[idx].idx = id;
+	ring->desc[idx].addr = addr;
 	ring->desc[idx].len = len;
-	ring->desc[idx].offset = offset;
 
 	return 0;
 }
-- 
cgit v1.2.3


From a412ef54fc2eb81bb55428dcdcdaa2e38ae9bba5 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:14 +0200
Subject: samples/bpf: adapted to new uapi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here, the xdpsock sample application is adjusted to the new descriptor
format.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdpsock_user.c | 84 ++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 48 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index e379eac034ac..b71a342b9082 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -46,6 +46,7 @@
 
 #define NUM_FRAMES 131072
 #define FRAME_HEADROOM 0
+#define FRAME_SHIFT 11
 #define FRAME_SIZE 2048
 #define NUM_DESCS 1024
 #define BATCH_SIZE 16
@@ -55,6 +56,7 @@
 
 #define DEBUG_HEXDUMP 0
 
+typedef __u64 u64;
 typedef __u32 u32;
 
 static unsigned long prev_time;
@@ -81,12 +83,12 @@ struct xdp_umem_uqueue {
 	u32 size;
 	u32 *producer;
 	u32 *consumer;
-	u32 *ring;
+	u64 *ring;
 	void *map;
 };
 
 struct xdp_umem {
-	char (*frames)[FRAME_SIZE];
+	char *frames;
 	struct xdp_umem_uqueue fq;
 	struct xdp_umem_uqueue cq;
 	int fd;
@@ -214,7 +216,7 @@ static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
 	for (i = 0; i < nb; i++) {
 		u32 idx = fq->cached_prod++ & fq->mask;
 
-		fq->ring[idx] = d[i].idx;
+		fq->ring[idx] = d[i].addr;
 	}
 
 	u_smp_wmb();
@@ -224,7 +226,7 @@ static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
 	return 0;
 }
 
-static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d,
+static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d,
 				      size_t nb)
 {
 	u32 i;
@@ -246,7 +248,7 @@ static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d,
 }
 
 static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
-					       u32 *d, size_t nb)
+					       u64 *d, size_t nb)
 {
 	u32 idx, i, entries = umem_nb_avail(cq, nb);
 
@@ -266,10 +268,9 @@ static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
 	return entries;
 }
 
-static inline void *xq_get_data(struct xdpsock *xsk, __u32 idx, __u32 off)
+static inline void *xq_get_data(struct xdpsock *xsk, u64 addr)
 {
-	lassert(idx < NUM_FRAMES);
-	return &xsk->umem->frames[idx][off];
+	return &xsk->umem->frames[addr];
 }
 
 static inline int xq_enq(struct xdp_uqueue *uq,
@@ -285,9 +286,8 @@ static inline int xq_enq(struct xdp_uqueue *uq,
 	for (i = 0; i < ndescs; i++) {
 		u32 idx = uq->cached_prod++ & uq->mask;
 
-		r[idx].idx = descs[i].idx;
+		r[idx].addr = descs[i].addr;
 		r[idx].len = descs[i].len;
-		r[idx].offset = descs[i].offset;
 	}
 
 	u_smp_wmb();
@@ -297,7 +297,7 @@ static inline int xq_enq(struct xdp_uqueue *uq,
 }
 
 static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
-				 __u32 idx, unsigned int ndescs)
+				 unsigned int id, unsigned int ndescs)
 {
 	struct xdp_desc *r = uq->ring;
 	unsigned int i;
@@ -308,9 +308,8 @@ static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
 	for (i = 0; i < ndescs; i++) {
 		u32 idx = uq->cached_prod++ & uq->mask;
 
-		r[idx].idx	= idx + i;
+		r[idx].addr	= (id + i) << FRAME_SHIFT;
 		r[idx].len	= sizeof(pkt_data) - 1;
-		r[idx].offset	= 0;
 	}
 
 	u_smp_wmb();
@@ -357,17 +356,21 @@ static void swap_mac_addresses(void *data)
 	*dst_addr = tmp;
 }
 
-#if DEBUG_HEXDUMP
-static void hex_dump(void *pkt, size_t length, const char *prefix)
+static void hex_dump(void *pkt, size_t length, u64 addr)
 {
-	int i = 0;
 	const unsigned char *address = (unsigned char *)pkt;
 	const unsigned char *line = address;
 	size_t line_size = 32;
 	unsigned char c;
+	char buf[32];
+	int i = 0;
 
+	if (!DEBUG_HEXDUMP)
+		return;
+
+	sprintf(buf, "addr=%llu", addr);
 	printf("length = %zu\n", length);
-	printf("%s | ", prefix);
+	printf("%s | ", buf);
 	while (length-- > 0) {
 		printf("%02X ", *address++);
 		if (!(++i % line_size) || (length == 0 && i % line_size)) {
@@ -382,12 +385,11 @@ static void hex_dump(void *pkt, size_t length, const char *prefix)
 			}
 			printf("\n");
 			if (length > 0)
-				printf("%s | ", prefix);
+				printf("%s | ", buf);
 		}
 	}
 	printf("\n");
 }
-#endif
 
 static size_t gen_eth_frame(char *frame)
 {
@@ -412,8 +414,8 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 
 	mr.addr = (__u64)bufs;
 	mr.len = NUM_FRAMES * FRAME_SIZE;
-	mr.frame_size = FRAME_SIZE;
-	mr.frame_headroom = FRAME_HEADROOM;
+	mr.chunk_size = FRAME_SIZE;
+	mr.headroom = FRAME_HEADROOM;
 
 	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
 	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
@@ -426,7 +428,7 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 			   &optlen) == 0);
 
 	umem->fq.map = mmap(0, off.fr.desc +
-			    FQ_NUM_DESCS * sizeof(u32),
+			    FQ_NUM_DESCS * sizeof(u64),
 			    PROT_READ | PROT_WRITE,
 			    MAP_SHARED | MAP_POPULATE, sfd,
 			    XDP_UMEM_PGOFF_FILL_RING);
@@ -439,7 +441,7 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 	umem->fq.ring = umem->fq.map + off.fr.desc;
 
 	umem->cq.map = mmap(0, off.cr.desc +
-			     CQ_NUM_DESCS * sizeof(u32),
+			     CQ_NUM_DESCS * sizeof(u64),
 			     PROT_READ | PROT_WRITE,
 			     MAP_SHARED | MAP_POPULATE, sfd,
 			     XDP_UMEM_PGOFF_COMPLETION_RING);
@@ -451,14 +453,14 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 	umem->cq.consumer = umem->cq.map + off.cr.consumer;
 	umem->cq.ring = umem->cq.map + off.cr.desc;
 
-	umem->frames = (char (*)[FRAME_SIZE])bufs;
+	umem->frames = bufs;
 	umem->fd = sfd;
 
 	if (opt_bench == BENCH_TXONLY) {
 		int i;
 
-		for (i = 0; i < NUM_FRAMES; i++)
-			(void)gen_eth_frame(&umem->frames[i][0]);
+		for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE)
+			(void)gen_eth_frame(&umem->frames[i]);
 	}
 
 	return umem;
@@ -472,7 +474,7 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	struct xdpsock *xsk;
 	bool shared = true;
 	socklen_t optlen;
-	u32 i;
+	u64 i;
 
 	sfd = socket(PF_XDP, SOCK_RAW, 0);
 	lassert(sfd >= 0);
@@ -508,7 +510,7 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	lassert(xsk->rx.map != MAP_FAILED);
 
 	if (!shared) {
-		for (i = 0; i < NUM_DESCS / 2; i++)
+		for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE)
 			lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
 				== 0);
 	}
@@ -727,7 +729,7 @@ static void kick_tx(int fd)
 
 static inline void complete_tx_l2fwd(struct xdpsock *xsk)
 {
-	u32 descs[BATCH_SIZE];
+	u64 descs[BATCH_SIZE];
 	unsigned int rcvd;
 	size_t ndescs;
 
@@ -749,7 +751,7 @@ static inline void complete_tx_l2fwd(struct xdpsock *xsk)
 
 static inline void complete_tx_only(struct xdpsock *xsk)
 {
-	u32 descs[BATCH_SIZE];
+	u64 descs[BATCH_SIZE];
 	unsigned int rcvd;
 
 	if (!xsk->outstanding_tx)
@@ -774,17 +776,9 @@ static void rx_drop(struct xdpsock *xsk)
 		return;
 
 	for (i = 0; i < rcvd; i++) {
-		u32 idx = descs[i].idx;
-
-		lassert(idx < NUM_FRAMES);
-#if DEBUG_HEXDUMP
-		char *pkt;
-		char buf[32];
+		char *pkt = xq_get_data(xsk, descs[i].addr);
 
-		pkt = xq_get_data(xsk, idx, descs[i].offset);
-		sprintf(buf, "idx=%d", idx);
-		hex_dump(pkt, descs[i].len, buf);
-#endif
+		hex_dump(pkt, descs[i].len, descs[i].addr);
 	}
 
 	xsk->rx_npkts += rcvd;
@@ -867,17 +861,11 @@ static void l2fwd(struct xdpsock *xsk)
 		}
 
 		for (i = 0; i < rcvd; i++) {
-			char *pkt = xq_get_data(xsk, descs[i].idx,
-						descs[i].offset);
+			char *pkt = xq_get_data(xsk, descs[i].addr);
 
 			swap_mac_addresses(pkt);
-#if DEBUG_HEXDUMP
-			char buf[32];
-			u32 idx = descs[i].idx;
 
-			sprintf(buf, "idx=%d", idx);
-			hex_dump(pkt, descs[i].len, buf);
-#endif
+			hex_dump(pkt, descs[i].len, descs[i].addr);
 		}
 
 		xsk->rx_npkts += rcvd;
-- 
cgit v1.2.3


From a65ea68b8dd92fea86d2b8ca7e43caeaa5ddcdff Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 4 Jun 2018 13:57:15 +0200
Subject: samples/bpf: minor *_nb_free performance fix

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdpsock_user.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index b71a342b9082..7494f60fbff8 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -157,15 +157,15 @@ static const char pkt_data[] =
 
 static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
 {
-	u32 free_entries = q->size - (q->cached_prod - q->cached_cons);
+	u32 free_entries = q->cached_cons - q->cached_prod;
 
 	if (free_entries >= nb)
 		return free_entries;
 
 	/* Refresh the local tail pointer */
-	q->cached_cons = *q->consumer;
+	q->cached_cons = *q->consumer + q->size;
 
-	return q->size - (q->cached_prod - q->cached_cons);
+	return q->cached_cons - q->cached_prod;
 }
 
 static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
@@ -439,6 +439,7 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 	umem->fq.producer = umem->fq.map + off.fr.producer;
 	umem->fq.consumer = umem->fq.map + off.fr.consumer;
 	umem->fq.ring = umem->fq.map + off.fr.desc;
+	umem->fq.cached_cons = FQ_NUM_DESCS;
 
 	umem->cq.map = mmap(0, off.cr.desc +
 			     CQ_NUM_DESCS * sizeof(u64),
@@ -535,6 +536,7 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	xsk->tx.producer = xsk->tx.map + off.tx.producer;
 	xsk->tx.consumer = xsk->tx.map + off.tx.consumer;
 	xsk->tx.ring = xsk->tx.map + off.tx.desc;
+	xsk->tx.cached_cons = NUM_DESCS;
 
 	sxdp.sxdp_family = PF_XDP;
 	sxdp.sxdp_ifindex = opt_ifindex;
-- 
cgit v1.2.3


From 34ea38ca27991466a8fff849514b4181b42ae2eb Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 4 Jun 2018 08:53:41 -0700
Subject: bpf: guard bpf_get_current_cgroup_id() with CONFIG_CGROUPS

Commit bf6fa2c893c5 ("bpf: implement bpf_get_current_cgroup_id()
helper") introduced a new helper bpf_get_current_cgroup_id().
The helper has a dependency on CONFIG_CGROUPS.

When CONFIG_CGROUPS is not defined, using the helper will result
the following verifier error:
  kernel subsystem misconfigured func bpf_get_current_cgroup_id#80
which is hard for users to interpret.
Guarding the reference to bpf_get_current_cgroup_id_proto with
CONFIG_CGROUPS will result in below better message:
  unknown func bpf_get_current_cgroup_id#80

Fixes: bf6fa2c893c5 ("bpf: implement bpf_get_current_cgroup_id() helper")
Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e2ab5b7f29d2..0ae6829804bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -564,8 +564,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_probe_read_str:
 		return &bpf_probe_read_str_proto;
+#ifdef CONFIG_CGROUPS
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
+#endif
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 2b589a7e2bd3eb610a4b7f5e61393481755a4de9 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Fri, 11 May 2018 11:06:34 +0800
Subject: bpf, arm32: correct check_imm24

imm24 is signed, so the right range is:

  [-(1<<(24 - 1)), (1<<(24 - 1)) - 1]

Note: this patch also fix a typo.

Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler")
Signed-off-by: Wang YanQing <udknight@gmail.com>
Cc: Shubham Bansal <illusionist.neo@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux@armlinux.org.uk
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/arm/net/bpf_jit_32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index d3ea6454e775..0d542007b49d 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -84,7 +84,7 @@
  *
  * 1. First argument is passed using the arm 32bit registers and rest of the
  * arguments are passed on stack scratch space.
- * 2. First callee-saved arugument is mapped to arm 32 bit registers and rest
+ * 2. First callee-saved argument is mapped to arm 32 bit registers and rest
  * arguments are mapped to scratch space on stack.
  * 3. We need two 64 bit temp registers to do complex operations on eBPF
  * registers.
@@ -1192,8 +1192,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	s32 jmp_offset;
 
 #define check_imm(bits, imm) do {				\
-	if ((((imm) > 0) && ((imm) >> (bits))) ||		\
-	    (((imm) < 0) && (~(imm) >> (bits)))) {		\
+	if ((imm) >= (1 << ((bits) - 1)) ||			\
+	    (imm) < -(1 << ((bits) - 1))) {			\
 		pr_info("[%2d] imm=%d(0x%x) out of range\n",	\
 			i, imm, imm);				\
 		return -EINVAL;					\
-- 
cgit v1.2.3


From 68565a1af9f7012e6f2fe2bdd612f67d2d830c28 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Fri, 11 May 2018 10:52:17 +0800
Subject: bpf, arm32: fix inconsistent naming about emit_a32_lsr_{r64,i64}

The names for BPF_ALU64 | BPF_ARSH are emit_a32_arsh_*,
the names for BPF_ALU64 | BPF_LSH are emit_a32_lsh_*, but
the names for BPF_ALU64 | BPF_RSH are emit_a32_lsr_*.

For consistence reason, let's rename emit_a32_lsr_* to
emit_a32_rsh_*.

This patch also corrects a wrong comment.

Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler")
Signed-off-by: Wang YanQing <udknight@gmail.com>
Cc: Shubham Bansal <illusionist.neo@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux@armlinux.org.uk
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/arm/net/bpf_jit_32.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 0d542007b49d..6e8b71613039 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -701,7 +701,7 @@ static inline void emit_a32_arsh_r64(const u8 dst[], const u8 src[], bool dstk,
 }
 
 /* dst = dst >> src */
-static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk,
+static inline void emit_a32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
 				     bool sstk, struct jit_ctx *ctx) {
 	const u8 *tmp = bpf2a32[TMP_REG_1];
 	const u8 *tmp2 = bpf2a32[TMP_REG_2];
@@ -717,7 +717,7 @@ static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk,
 		emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
 	}
 
-	/* Do LSH operation */
+	/* Do RSH operation */
 	emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
 	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
 	emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx);
@@ -767,7 +767,7 @@ static inline void emit_a32_lsh_i64(const u8 dst[], bool dstk,
 }
 
 /* dst = dst >> val */
-static inline void emit_a32_lsr_i64(const u8 dst[], bool dstk,
+static inline void emit_a32_rsh_i64(const u8 dst[], bool dstk,
 				    const u32 val, struct jit_ctx *ctx) {
 	const u8 *tmp = bpf2a32[TMP_REG_1];
 	const u8 *tmp2 = bpf2a32[TMP_REG_2];
@@ -1323,7 +1323,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	case BPF_ALU64 | BPF_RSH | BPF_K:
 		if (unlikely(imm > 63))
 			return -EINVAL;
-		emit_a32_lsr_i64(dst, dstk, imm, ctx);
+		emit_a32_rsh_i64(dst, dstk, imm, ctx);
 		break;
 	/* dst = dst << src */
 	case BPF_ALU64 | BPF_LSH | BPF_X:
@@ -1331,7 +1331,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		break;
 	/* dst = dst >> src */
 	case BPF_ALU64 | BPF_RSH | BPF_X:
-		emit_a32_lsr_r64(dst, src, dstk, sstk, ctx);
+		emit_a32_rsh_r64(dst, src, dstk, sstk, ctx);
 		break;
 	/* dst = dst >> src (signed) */
 	case BPF_ALU64 | BPF_ARSH | BPF_X:
-- 
cgit v1.2.3


From 763ea096f3cf312608317fb1027d509cfd1efc16 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:30 +0200
Subject: i40e: remove ndo_xdp_flush call i40e_xdp_flush

Remove the ndo_xdp_flush call implementation i40e_xdp_flush
as no callers of ndo_xdp_flush are left.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  1 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 19 -------------------
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 -
 3 files changed, 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b5daa5c9c7de..c944bd10b03d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11883,7 +11883,6 @@ static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_bridge_setlink	= i40e_ndo_bridge_setlink,
 	.ndo_bpf		= i40e_xdp,
 	.ndo_xdp_xmit		= i40e_xdp_xmit,
-	.ndo_xdp_flush		= i40e_xdp_flush,
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5f01e4ce9c92..713995d04783 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3707,22 +3707,3 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 
 	return n - drops;
 }
-
-/**
- * i40e_xdp_flush - Implements ndo_xdp_flush
- * @dev: netdev
- **/
-void i40e_xdp_flush(struct net_device *dev)
-{
-	struct i40e_netdev_priv *np = netdev_priv(dev);
-	unsigned int queue_index = smp_processor_id();
-	struct i40e_vsi *vsi = np->vsi;
-
-	if (test_bit(__I40E_VSI_DOWN, vsi->state))
-		return;
-
-	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
-		return;
-
-	i40e_xdp_ring_update_tail(vsi->xdp_rings[queue_index]);
-}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 820f76db251b..bb04f6a731fe 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -489,7 +489,6 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
 int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 		  u32 flags);
-void i40e_xdp_flush(struct net_device *dev);
 
 /**
  * i40e_get_head - Retrieve head from head writeback
-- 
cgit v1.2.3


From af3746a779aa5106fb3d1771b8fb28522476b073 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:35 +0200
Subject: ixgbe: remove ndo_xdp_flush call ixgbe_xdp_flush

Remove the ndo_xdp_flush call implementation ixgbe_xdp_flush
as no callers of ndo_xdp_flush are left.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4fd77c9067f2..ef1afb3a8a97 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10069,26 +10069,6 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	return n - drops;
 }
 
-static void ixgbe_xdp_flush(struct net_device *dev)
-{
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
-	struct ixgbe_ring *ring;
-
-	/* Its possible the device went down between xdp xmit and flush so
-	 * we need to ensure device is still up.
-	 */
-	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
-		return;
-
-	ring = adapter->xdp_prog ? adapter->xdp_ring[smp_processor_id()] : NULL;
-	if (unlikely(!ring))
-		return;
-
-	ixgbe_xdp_ring_update_tail(ring);
-
-	return;
-}
-
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
@@ -10136,7 +10116,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_features_check	= ixgbe_features_check,
 	.ndo_bpf		= ixgbe_xdp,
 	.ndo_xdp_xmit		= ixgbe_xdp_xmit,
-	.ndo_xdp_flush		= ixgbe_xdp_flush,
 };
 
 /**
-- 
cgit v1.2.3


From 16ec025aa5077144747c97a1e40635f547457a95 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:40 +0200
Subject: virtio_net: remove ndo_xdp_flush call virtnet_xdp_flush

Remove the ndo_xdp_flush call implementation virtnet_xdp_flush
as no callers of ndo_xdp_flush are left.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/virtio_net.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 62ba8aadd8e6..8c5b59e79439 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -407,18 +407,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
-static void virtnet_xdp_flush(struct net_device *dev)
-{
-	struct virtnet_info *vi = netdev_priv(dev);
-	struct send_queue *sq;
-	unsigned int qp;
-
-	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-	sq = &vi->sq[qp];
-
-	virtqueue_kick(sq->vq);
-}
-
 static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
 				   struct send_queue *sq,
 				   struct xdp_frame *xdpf)
@@ -2359,7 +2347,6 @@ static const struct net_device_ops virtnet_netdev = {
 #endif
 	.ndo_bpf		= virtnet_xdp,
 	.ndo_xdp_xmit		= virtnet_xdp_xmit,
-	.ndo_xdp_flush		= virtnet_xdp_flush,
 	.ndo_features_check	= passthru_features_check,
 };
 
-- 
cgit v1.2.3


From 42421a56541a3b0fb6656406b657ad4686cdaaeb Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:45 +0200
Subject: tun: remove ndo_xdp_flush call tun_xdp_flush

Remove the ndo_xdp_flush call implementation tun_xdp_flush
as no callers of ndo_xdp_flush are left.

The tun drivers XDP_TX implementation also used tun_xdp_flush (and
tun_xdp_xmit).  This is easily solved by passing the XDP_XMIT_FLUSH
flag to tun_xdp_xmit in tun_xdp_tx.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/tun.c | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d82a05fb0594..ef09224496e8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1347,26 +1347,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, 1, &frame, 0);
-}
-
-static void tun_xdp_flush(struct net_device *dev)
-{
-	struct tun_struct *tun = netdev_priv(dev);
-	struct tun_file *tfile;
-	u32 numqueues;
-
-	rcu_read_lock();
-
-	numqueues = READ_ONCE(tun->numqueues);
-	if (!numqueues)
-		goto out;
-
-	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
-					    numqueues]);
-	__tun_xdp_flush_tfile(tfile);
-out:
-	rcu_read_unlock();
+	return tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
 }
 
 static const struct net_device_ops tap_netdev_ops = {
@@ -1387,7 +1368,6 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_xdp_xmit		= tun_xdp_xmit,
-	.ndo_xdp_flush		= tun_xdp_flush,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -1706,7 +1686,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 			alloc_frag->offset += buflen;
 			if (tun_xdp_tx(tun->dev, &xdp))
 				goto err_redirect;
-			tun_xdp_flush(tun->dev);
 			rcu_read_unlock();
 			preempt_enable();
 			return NULL;
-- 
cgit v1.2.3


From 189454e868c45ce3476bfac03ace921dec34208a Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:50 +0200
Subject: net: remove net_device operation ndo_xdp_flush

All drivers are cleaned up and no references to ndo_xdp_flush
are left in drivers, it is time to remove the net_device_ops
operation ndo_xdp_flush.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7f17785a59d7..42c6ea35a6f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1192,9 +1192,6 @@ struct dev_ifalias {
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
- * void (*ndo_xdp_flush)(struct net_device *dev);
- *	This function is used to inform the driver to flush a particular
- *	xdp tx queue. Must be called on same CPU as xdp_xmit.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1382,7 +1379,6 @@ struct net_device_ops {
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
 						struct xdp_frame **xdp,
 						u32 flags);
-	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
 /**
-- 
cgit v1.2.3


From e61e62b9e2cc14b336f330f37f517f9d373ff31e Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:51 +0200
Subject: xsk: moved struct xdp_umem definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Moved struct xdp_umem to xdp_sock.h, in order to prepare for zero-copy
support.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h | 24 +++++++++++++++++++++++-
 net/xdp/xdp_umem.c     |  1 +
 net/xdp/xdp_umem.h     | 22 +---------------------
 net/xdp/xsk_queue.h    |  3 +--
 4 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 7a647c56ec15..3a6cd88f179d 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -6,12 +6,34 @@
 #ifndef _LINUX_XDP_SOCK_H
 #define _LINUX_XDP_SOCK_H
 
+#include <linux/workqueue.h>
+#include <linux/if_xdp.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
 #include <net/sock.h>
 
 struct net_device;
 struct xsk_queue;
-struct xdp_umem;
+
+struct xdp_umem_props {
+	u64 chunk_mask;
+	u64 size;
+};
+
+struct xdp_umem {
+	struct xsk_queue *fq;
+	struct xsk_queue *cq;
+	struct page **pgs;
+	struct xdp_umem_props props;
+	u32 headroom;
+	u32 chunk_size_nohr;
+	struct user_struct *user;
+	struct pid *pid;
+	unsigned long address;
+	refcount_t users;
+	struct work_struct work;
+	u32 npgs;
+};
 
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 9ad791ff4739..2793a503223e 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -13,6 +13,7 @@
 #include <linux/mm.h>
 
 #include "xdp_umem.h"
+#include "xsk_queue.h"
 
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index aeadd1bcb72d..9433e8af650a 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -6,27 +6,7 @@
 #ifndef XDP_UMEM_H_
 #define XDP_UMEM_H_
 
-#include <linux/mm.h>
-#include <linux/if_xdp.h>
-#include <linux/workqueue.h>
-
-#include "xsk_queue.h"
-#include "xdp_umem_props.h"
-
-struct xdp_umem {
-	struct xsk_queue *fq;
-	struct xsk_queue *cq;
-	struct page **pgs;
-	struct xdp_umem_props props;
-	u32 headroom;
-	u32 chunk_size_nohr;
-	struct user_struct *user;
-	struct pid *pid;
-	unsigned long address;
-	refcount_t users;
-	struct work_struct work;
-	u32 npgs;
-};
+#include <net/xdp_sock.h>
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 337e5ad3b10e..5246ed420a16 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -8,8 +8,7 @@
 
 #include <linux/types.h>
 #include <linux/if_xdp.h>
-
-#include "xdp_umem_props.h"
+#include <net/xdp_sock.h>
 
 #define RX_BATCH_SIZE 16
 
-- 
cgit v1.2.3


From 8aef7340ae9695912a411886452ae9773206e845 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:52 +0200
Subject: xsk: introduce xdp_umem_page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xdp_umem_page holds the address for a page. Trade memory for
faster lookup. Later, we'll add DMA address here as well.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  7 ++++++-
 net/xdp/xdp_umem.c     | 15 ++++++++++++++-
 net/xdp/xdp_umem.h     |  3 +--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3a6cd88f179d..caf343a7e224 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -20,10 +20,14 @@ struct xdp_umem_props {
 	u64 size;
 };
 
+struct xdp_umem_page {
+	void *addr;
+};
+
 struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
-	struct page **pgs;
+	struct xdp_umem_page *pages;
 	struct xdp_umem_props props;
 	u32 headroom;
 	u32 chunk_size_nohr;
@@ -32,6 +36,7 @@ struct xdp_umem {
 	unsigned long address;
 	refcount_t users;
 	struct work_struct work;
+	struct page **pgs;
 	u32 npgs;
 };
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 2793a503223e..aca826011f6c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -65,6 +65,9 @@ static void xdp_umem_release(struct xdp_umem *umem)
 		goto out;
 
 	mmput(mm);
+	kfree(umem->pages);
+	umem->pages = NULL;
+
 	xdp_umem_unaccount_pages(umem);
 out:
 	kfree(umem);
@@ -155,7 +158,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
 	unsigned int chunks, chunks_per_page;
 	u64 addr = mr->addr, size = mr->len;
-	int size_chk, err;
+	int size_chk, err, i;
 
 	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
 		/* Strictly speaking we could support this, if:
@@ -213,6 +216,16 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	err = xdp_umem_pin_pages(umem);
 	if (err)
 		goto out_account;
+
+	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
+	if (!umem->pages) {
+		err = -ENOMEM;
+		goto out_account;
+	}
+
+	for (i = 0; i < umem->npgs; i++)
+		umem->pages[i].addr = page_address(umem->pgs[i]);
+
 	return 0;
 
 out_account:
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 9433e8af650a..40e8fa4a92af 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -10,8 +10,7 @@
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-	return page_address(umem->pgs[addr >> PAGE_SHIFT]) +
-		(addr & (PAGE_SIZE - 1));
+	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
-- 
cgit v1.2.3


From 74515c5750f30244a901c3c0c82a2fe534b3c9c5 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:53 +0200
Subject: net: xdp: added bpf_netdev_command XDP_{QUERY, SETUP}_XSK_UMEM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend ndo_bpf with two new commands used for query zero-copy support
and register an UMEM to a queue_id of a netdev.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 42c6ea35a6f2..cc4ea7ab6d24 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -817,10 +817,13 @@ enum bpf_netdev_command {
 	BPF_OFFLOAD_DESTROY,
 	BPF_OFFLOAD_MAP_ALLOC,
 	BPF_OFFLOAD_MAP_FREE,
+	XDP_QUERY_XSK_UMEM,
+	XDP_SETUP_XSK_UMEM,
 };
 
 struct bpf_prog_offload_ops;
 struct netlink_ext_ack;
+struct xdp_umem;
 
 struct netdev_bpf {
 	enum bpf_netdev_command command;
@@ -851,6 +854,11 @@ struct netdev_bpf {
 		struct {
 			struct bpf_offloaded_map *offmap;
 		};
+		/* XDP_SETUP_XSK_UMEM */
+		struct {
+			struct xdp_umem *umem;
+			u16 queue_id;
+		} xsk;
 	};
 };
 
-- 
cgit v1.2.3


From 02b55e5657c3a569fc681ba851e464cfa6b90d4f Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:54 +0200
Subject: xdp: add MEM_TYPE_ZERO_COPY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here, a new type of allocator support is added to the XDP return
API. A zero-copy allocated xdp_buff cannot be converted to an
xdp_frame. Instead is the buff has to be copied. This is not supported
at all in this commit.

Also, an opaque "handle" is added to xdp_buff. This can be used as a
context for the zero-copy allocator implementation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp.h | 10 ++++++++++
 net/core/xdp.c    | 19 ++++++++++++++-----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index a3b71a4dd71d..2deea7166a34 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -37,6 +37,7 @@ enum xdp_mem_type {
 	MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
 	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
 	MEM_TYPE_PAGE_POOL,
+	MEM_TYPE_ZERO_COPY,
 	MEM_TYPE_MAX,
 };
 
@@ -51,6 +52,10 @@ struct xdp_mem_info {
 
 struct page_pool;
 
+struct zero_copy_allocator {
+	void (*free)(struct zero_copy_allocator *zca, unsigned long handle);
+};
+
 struct xdp_rxq_info {
 	struct net_device *dev;
 	u32 queue_index;
@@ -63,6 +68,7 @@ struct xdp_buff {
 	void *data_end;
 	void *data_meta;
 	void *data_hard_start;
+	unsigned long handle;
 	struct xdp_rxq_info *rxq;
 };
 
@@ -86,6 +92,10 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 	int metasize;
 	int headroom;
 
+	/* TODO: implement clone, copy, use "native" MEM_TYPE */
+	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+		return NULL;
+
 	/* Assure headroom is available for storing info */
 	headroom = xdp->data - xdp->data_hard_start;
 	metasize = xdp->data - xdp->data_meta;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index cb8c4e061a5a..9d1f22072d5d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -31,6 +31,7 @@ struct xdp_mem_allocator {
 	union {
 		void *allocator;
 		struct page_pool *page_pool;
+		struct zero_copy_allocator *zc_alloc;
 	};
 	struct rhash_head node;
 	struct rcu_head rcu;
@@ -261,7 +262,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 	xdp_rxq->mem.type = type;
 
 	if (!allocator) {
-		if (type == MEM_TYPE_PAGE_POOL)
+		if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
 			return -EINVAL; /* Setup time check page_pool req */
 		return 0;
 	}
@@ -314,7 +315,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
  * is used for those calls sites.  Thus, allowing for faster recycling
  * of xdp_frames/pages in those cases.
  */
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+			 unsigned long handle)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
@@ -338,6 +340,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 		page = virt_to_page(data); /* Assumes order0 page*/
 		put_page(page);
 		break;
+	case MEM_TYPE_ZERO_COPY:
+		/* NB! Only valid from an xdp_buff! */
+		rcu_read_lock();
+		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+		xa->zc_alloc->free(xa->zc_alloc, handle);
+		rcu_read_unlock();
 	default:
 		/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
 		break;
@@ -346,18 +355,18 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-	__xdp_return(xdpf->data, &xdpf->mem, false);
+	__xdp_return(xdpf->data, &xdpf->mem, false, 0);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
-	__xdp_return(xdpf->data, &xdpf->mem, true);
+	__xdp_return(xdpf->data, &xdpf->mem, true, 0);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
 
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-	__xdp_return(xdp->data, &xdp->rxq->mem, true);
+	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
-- 
cgit v1.2.3


From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:55 +0200
Subject: xsk: add zero-copy support for Rx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and
wireup ndo_bpf call in bind.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h      |  6 +++
 include/uapi/linux/if_xdp.h |  4 +-
 net/xdp/xdp_umem.c          | 77 ++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h          |  3 ++
 net/xdp/xsk.c               | 96 +++++++++++++++++++++++++++++++++++----------
 5 files changed, 165 insertions(+), 21 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index caf343a7e224..d93d3aac3fc9 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -22,6 +22,7 @@ struct xdp_umem_props {
 
 struct xdp_umem_page {
 	void *addr;
+	dma_addr_t dma;
 };
 
 struct xdp_umem {
@@ -38,6 +39,9 @@ struct xdp_umem {
 	struct work_struct work;
 	struct page **pgs;
 	u32 npgs;
+	struct net_device *dev;
+	u16 queue_id;
+	bool zc;
 };
 
 struct xdp_sock {
@@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
+void xsk_umem_discard_addr(struct xdp_umem *umem);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index e411d6f9ac65..1fa0e977ea8d 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -13,7 +13,9 @@
 #include <linux/types.h>
 
 /* Options for the sxdp_flags field */
-#define XDP_SHARED_UMEM 1
+#define XDP_SHARED_UMEM	(1 << 0)
+#define XDP_COPY	(1 << 1) /* Force copy-mode */
+#define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index aca826011f6c..f729d79b8d91 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -17,6 +17,81 @@
 
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+			u32 queue_id, u16 flags)
+{
+	bool force_zc, force_copy;
+	struct netdev_bpf bpf;
+	int err;
+
+	force_zc = flags & XDP_ZEROCOPY;
+	force_copy = flags & XDP_COPY;
+
+	if (force_zc && force_copy)
+		return -EINVAL;
+
+	if (force_copy)
+		return 0;
+
+	dev_hold(dev);
+
+	if (dev->netdev_ops->ndo_bpf) {
+		bpf.command = XDP_QUERY_XSK_UMEM;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? -ENOTSUPP : 0;
+		}
+
+		bpf.command = XDP_SETUP_XSK_UMEM;
+		bpf.xsk.umem = umem;
+		bpf.xsk.queue_id = queue_id;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? err : 0; /* fail or fallback */
+		}
+
+		umem->dev = dev;
+		umem->queue_id = queue_id;
+		umem->zc = true;
+		return 0;
+	}
+
+	dev_put(dev);
+	return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
+}
+
+void xdp_umem_clear_dev(struct xdp_umem *umem)
+{
+	struct netdev_bpf bpf;
+	int err;
+
+	if (umem->dev) {
+		bpf.command = XDP_SETUP_XSK_UMEM;
+		bpf.xsk.umem = NULL;
+		bpf.xsk.queue_id = umem->queue_id;
+
+		rtnl_lock();
+		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
+		rtnl_unlock();
+
+		if (err)
+			WARN(1, "failed to disable umem!\n");
+
+		dev_put(umem->dev);
+		umem->dev = NULL;
+	}
+}
+
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
 	unsigned int i;
@@ -43,6 +118,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
 	struct task_struct *task;
 	struct mm_struct *mm;
 
+	xdp_umem_clear_dev(umem);
+
 	if (umem->fq) {
 		xskq_destroy(umem->fq);
 		umem->fq = NULL;
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 40e8fa4a92af..674508a32a4d 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -13,6 +13,9 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+			u32 queue_id, u16 flags);
+void xdp_umem_clear_dev(struct xdp_umem *umem);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4688c750df1d..ab64bd8260ea 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,19 +36,28 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 {
-	return !!xs->rx;
+	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
+		READ_ONCE(xs->umem->fq);
 }
 
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+{
+	return xskq_peek_addr(umem->fq, addr);
+}
+EXPORT_SYMBOL(xsk_umem_peek_addr);
+
+void xsk_umem_discard_addr(struct xdp_umem *umem)
+{
+	xskq_discard_addr(umem->fq);
+}
+EXPORT_SYMBOL(xsk_umem_discard_addr);
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	u32 len = xdp->data_end - xdp->data;
 	void *buffer;
 	u64 addr;
 	int err;
 
-	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
-		return -EINVAL;
-
 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
 	    len > xs->umem->chunk_size_nohr) {
 		xs->rx_dropped++;
@@ -60,25 +69,41 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	buffer = xdp_umem_get_data(xs->umem, addr);
 	memcpy(buffer, xdp->data, len);
 	err = xskq_produce_batch_desc(xs->rx, addr, len);
-	if (!err)
+	if (!err) {
 		xskq_discard_addr(xs->umem->fq);
-	else
-		xs->rx_dropped++;
+		xdp_return_buff(xdp);
+		return 0;
+	}
 
+	xs->rx_dropped++;
 	return err;
 }
 
-int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	int err;
+	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
 
-	err = __xsk_rcv(xs, xdp);
-	if (likely(!err))
+	if (err) {
 		xdp_return_buff(xdp);
+		xs->rx_dropped++;
+	}
 
 	return err;
 }
 
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 len;
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+		return -EINVAL;
+
+	len = xdp->data_end - xdp->data;
+
+	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
+		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+}
+
 void xsk_flush(struct xdp_sock *xs)
 {
 	xskq_produce_flush_desc(xs->rx);
@@ -87,12 +112,29 @@ void xsk_flush(struct xdp_sock *xs)
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
+	u32 len = xdp->data_end - xdp->data;
+	void *buffer;
+	u64 addr;
 	int err;
 
-	err = __xsk_rcv(xs, xdp);
-	if (!err)
+	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	    len > xs->umem->chunk_size_nohr) {
+		xs->rx_dropped++;
+		return -ENOSPC;
+	}
+
+	addr += xs->umem->headroom;
+
+	buffer = xdp_umem_get_data(xs->umem, addr);
+	memcpy(buffer, xdp->data, len);
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
+	if (!err) {
+		xskq_discard_addr(xs->umem->fq);
 		xsk_flush(xs);
+		return 0;
+	}
 
+	xs->rx_dropped++;
 	return err;
 }
 
@@ -291,6 +333,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 	struct net_device *dev;
+	u32 flags, qid;
 	int err = 0;
 
 	if (addr_len < sizeof(struct sockaddr_xdp))
@@ -315,16 +358,26 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_unlock;
 	}
 
-	if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
-	    (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
+	qid = sxdp->sxdp_queue_id;
+
+	if ((xs->rx && qid >= dev->real_num_rx_queues) ||
+	    (xs->tx && qid >= dev->real_num_tx_queues)) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
 
-	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
+	flags = sxdp->sxdp_flags;
+
+	if (flags & XDP_SHARED_UMEM) {
 		struct xdp_sock *umem_xs;
 		struct socket *sock;
 
+		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+			/* Cannot specify flags for shared sockets. */
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
 		if (xs->umem) {
 			/* We have already our own. */
 			err = -EINVAL;
@@ -343,8 +396,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 			err = -EBADF;
 			sockfd_put(sock);
 			goto out_unlock;
-		} else if (umem_xs->dev != dev ||
-			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
+		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
 			err = -EINVAL;
 			sockfd_put(sock);
 			goto out_unlock;
@@ -360,6 +412,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		/* This xsk has its own umem. */
 		xskq_set_umem(xs->umem->fq, &xs->umem->props);
 		xskq_set_umem(xs->umem->cq, &xs->umem->props);
+
+		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
+		if (err)
+			goto out_unlock;
 	}
 
 	xs->dev = dev;
-- 
cgit v1.2.3


From e3760c7e50ac6cdf1188fec44938dd7e6e6eef61 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 4 Jun 2018 14:05:56 +0200
Subject: net: added netdevice operation for Tx

Added ndo_xsk_async_xmit. This ndo "kicks" the netdev to start to pull
userland AF_XDP Tx frames from a NAPI context.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cc4ea7ab6d24..03ffeadf8a41 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1387,6 +1387,8 @@ struct net_device_ops {
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
 						struct xdp_frame **xdp,
 						u32 flags);
+	int			(*ndo_xsk_async_xmit)(struct net_device *dev,
+						      u32 queue_id);
 };
 
 /**
-- 
cgit v1.2.3


From ac98d8aab61baf785eb8f099b36daf34fc76a70e Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 4 Jun 2018 14:05:57 +0200
Subject: xsk: wire upp Tx zero-copy functions

Here we add the functionality required to support zero-copy Tx, and
also exposes various zero-copy related functions for the netdevs.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  9 +++++++
 net/xdp/xdp_umem.c     | 29 +++++++++++++++++++--
 net/xdp/xdp_umem.h     |  8 +++++-
 net/xdp/xsk.c          | 70 +++++++++++++++++++++++++++++++++++++++++++++-----
 net/xdp/xsk_queue.h    | 32 ++++++++++++++++++++++-
 5 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index d93d3aac3fc9..9fe472f2ac95 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -9,6 +9,7 @@
 #include <linux/workqueue.h>
 #include <linux/if_xdp.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <net/sock.h>
 
@@ -42,6 +43,8 @@ struct xdp_umem {
 	struct net_device *dev;
 	u16 queue_id;
 	bool zc;
+	spinlock_t xsk_list_lock;
+	struct list_head xsk_list;
 };
 
 struct xdp_sock {
@@ -53,6 +56,8 @@ struct xdp_sock {
 	struct list_head flush_node;
 	u16 queue_id;
 	struct xsk_queue *tx ____cacheline_aligned_in_smp;
+	struct list_head list;
+	bool zc;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 	u64 rx_dropped;
@@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+/* Used from netdev driver */
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
 void xsk_umem_discard_addr(struct xdp_umem *umem);
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
+void xsk_umem_consume_tx_done(struct xdp_umem *umem);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index f729d79b8d91..7eb4948a38d2 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -17,6 +17,29 @@
 
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&umem->xsk_list_lock, flags);
+	list_add_rcu(&xs->list, &umem->xsk_list);
+	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+}
+
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+	unsigned long flags;
+
+	if (xs->dev) {
+		spin_lock_irqsave(&umem->xsk_list_lock, flags);
+		list_del_rcu(&xs->list);
+		spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+
+		if (umem->zc)
+			synchronize_net();
+	}
+}
+
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 			u32 queue_id, u16 flags)
 {
@@ -35,7 +58,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 
 	dev_hold(dev);
 
-	if (dev->netdev_ops->ndo_bpf) {
+	if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
 		bpf.command = XDP_QUERY_XSK_UMEM;
 
 		rtnl_lock();
@@ -70,7 +93,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
 }
 
-void xdp_umem_clear_dev(struct xdp_umem *umem)
+static void xdp_umem_clear_dev(struct xdp_umem *umem)
 {
 	struct netdev_bpf bpf;
 	int err;
@@ -283,6 +306,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	umem->npgs = size / PAGE_SIZE;
 	umem->pgs = NULL;
 	umem->user = NULL;
+	INIT_LIST_HEAD(&umem->xsk_list);
+	spin_lock_init(&umem->xsk_list_lock);
 
 	refcount_set(&umem->users, 1);
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 674508a32a4d..f11560334f88 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -13,12 +13,18 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
+static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
+{
+	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+}
+
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 			u32 queue_id, u16 flags);
-void xdp_umem_clear_dev(struct xdp_umem *umem);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
 struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
 
 #endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ab64bd8260ea..ddca4bf1cfc8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/net.h>
 #include <linux/netdevice.h>
+#include <linux/rculist.h>
 #include <net/xdp_sock.h>
 #include <net/xdp.h>
 
@@ -138,6 +139,59 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return err;
 }
 
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+	xskq_produce_flush_addr_n(umem->cq, nb_entries);
+}
+EXPORT_SYMBOL(xsk_umem_complete_tx);
+
+void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+	struct xdp_sock *xs;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+		xs->sk.sk_write_space(&xs->sk);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+{
+	struct xdp_desc desc;
+	struct xdp_sock *xs;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+		if (!xskq_peek_desc(xs->tx, &desc))
+			continue;
+
+		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+			goto out;
+
+		*dma = xdp_umem_get_dma(umem, desc.addr);
+		*len = desc.len;
+
+		xskq_discard_desc(xs->tx);
+		rcu_read_unlock();
+		return true;
+	}
+
+out:
+	rcu_read_unlock();
+	return false;
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx);
+
+static int xsk_zc_xmit(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct net_device *dev = xs->dev;
+
+	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+}
+
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
@@ -151,7 +205,6 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			    size_t total_len)
 {
-	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
 	u32 max_batch = TX_BATCH_SIZE;
 	struct xdp_sock *xs = xdp_sk(sk);
 	bool sent_frame = false;
@@ -161,8 +214,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 
 	if (unlikely(!xs->tx))
 		return -ENOBUFS;
-	if (need_wait)
-		return -EOPNOTSUPP;
 
 	mutex_lock(&xs->mutex);
 
@@ -192,7 +243,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			goto out;
 		}
 
-		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
+		skb = sock_alloc_send_skb(sk, len, 1, &err);
 		if (unlikely(!skb)) {
 			err = -EAGAIN;
 			goto out;
@@ -235,6 +286,7 @@ out:
 
 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
+	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 
@@ -242,8 +294,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 		return -ENXIO;
 	if (unlikely(!(xs->dev->flags & IFF_UP)))
 		return -ENETDOWN;
+	if (need_wait)
+		return -EOPNOTSUPP;
 
-	return xsk_generic_xmit(sk, m, total_len);
+	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
 }
 
 static unsigned int xsk_poll(struct file *file, struct socket *sock,
@@ -419,10 +473,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	}
 
 	xs->dev = dev;
-	xs->queue_id = sxdp->sxdp_queue_id;
-
+	xs->zc = xs->umem->zc;
+	xs->queue_id = qid;
 	xskq_set_umem(xs->rx, &xs->umem->props);
 	xskq_set_umem(xs->tx, &xs->umem->props);
+	xdp_add_sk_umem(xs->umem, xs);
 
 out_unlock:
 	if (err)
@@ -660,6 +715,7 @@ static void xsk_destruct(struct sock *sk)
 
 	xskq_destroy(xs->rx);
 	xskq_destroy(xs->tx);
+	xdp_del_sk_umem(xs->umem, xs);
 	xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 5246ed420a16..ef6a6f0ec949 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -11,6 +11,7 @@
 #include <net/xdp_sock.h>
 
 #define RX_BATCH_SIZE 16
+#define LAZY_UPDATE_THRESHOLD 128
 
 struct xdp_ring {
 	u32 producer ____cacheline_aligned_in_smp;
@@ -61,9 +62,14 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
 	return (entries > dcnt) ? dcnt : entries;
 }
 
+static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
+{
+	return q->nentries - (producer - q->cons_tail);
+}
+
 static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 {
-	u32 free_entries = q->nentries - (producer - q->cons_tail);
+	u32 free_entries = xskq_nb_free_lazy(q, producer);
 
 	if (free_entries >= dcnt)
 		return free_entries;
@@ -123,6 +129,9 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
+	if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0)
+		return -ENOSPC;
+
 	ring->desc[q->prod_tail++ & q->ring_mask] = addr;
 
 	/* Order producer and data */
@@ -132,6 +141,27 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
+static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
+		return -ENOSPC;
+
+	ring->desc[q->prod_head++ & q->ring_mask] = addr;
+	return 0;
+}
+
+static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
+					     u32 nb_entries)
+{
+	/* Order producer and data */
+	smp_wmb();
+
+	q->prod_tail += nb_entries;
+	WRITE_ONCE(q->ring->producer, q->prod_tail);
+}
+
 static inline int xskq_reserve_addr(struct xsk_queue *q)
 {
 	if (xskq_nb_free(q, q->prod_head, 1) == 0)
-- 
cgit v1.2.3


From 9f5232cc7f040f443f81069f553d31b27ab7eb79 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:06:01 +0200
Subject: samples/bpf: xdpsock: use skb Tx path for XDP_SKB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure that XDP_SKB also uses the skb Tx path.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdpsock_user.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 7494f60fbff8..d69c8d78d3fd 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -75,6 +75,7 @@ static int opt_queue;
 static int opt_poll;
 static int opt_shared_packet_buffer;
 static int opt_interval = 1;
+static u32 opt_xdp_bind_flags;
 
 struct xdp_umem_uqueue {
 	u32 cached_prod;
@@ -541,9 +542,12 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	sxdp.sxdp_family = PF_XDP;
 	sxdp.sxdp_ifindex = opt_ifindex;
 	sxdp.sxdp_queue_id = opt_queue;
+
 	if (shared) {
 		sxdp.sxdp_flags = XDP_SHARED_UMEM;
 		sxdp.sxdp_shared_umem_fd = umem->fd;
+	} else {
+		sxdp.sxdp_flags = opt_xdp_bind_flags;
 	}
 
 	lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
@@ -699,6 +703,7 @@ static void parse_command_line(int argc, char **argv)
 			break;
 		case 'S':
 			opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
+			opt_xdp_bind_flags |= XDP_COPY;
 			break;
 		case 'N':
 			opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
-- 
cgit v1.2.3