summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/arm/net/bpf_jit_32.c2
-rw-r--r--arch/s390/net/bpf_jit_comp.c1
-rw-r--r--drivers/media/rc/bpf-lirc.c14
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/main.c6
-rw-r--r--include/linux/bpf-cgroup.h26
-rw-r--r--include/linux/bpf.h8
-rw-r--r--include/linux/bpf_lirc.h5
-rw-r--r--include/linux/filter.h56
-rw-r--r--include/uapi/linux/bpf.h28
-rw-r--r--kernel/bpf/cgroup.c54
-rw-r--r--kernel/bpf/core.c30
-rw-r--r--kernel/bpf/sockmap.c254
-rw-r--r--kernel/bpf/syscall.c99
-rw-r--r--lib/test_bpf.c20
-rw-r--r--net/core/filter.c86
-rw-r--r--samples/bpf/xdp_fwd_kern.c8
-rw-r--r--tools/bpf/bpftool/prog.c12
-rw-r--r--tools/testing/selftests/bpf/config1
-rwxr-xr-xtools/testing/selftests/bpf/test_kmod.sh9
-rwxr-xr-xtools/testing/selftests/bpf/test_lirc_mode2.sh9
-rwxr-xr-xtools/testing/selftests/bpf/test_lwt_seg6local.sh9
-rw-r--r--tools/testing/selftests/bpf/test_sockmap.c6
22 files changed, 449 insertions, 294 deletions
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 6e8b71613039..f6a62ae44a65 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1844,7 +1844,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
/* there are 2 passes here */
bpf_jit_dump(prog->len, image_size, 2, ctx.target);
- set_memory_ro((unsigned long)header, header->pages);
+ bpf_jit_binary_lock_ro(header);
prog->bpf_func = (void *)ctx.target;
prog->jited = 1;
prog->jited_len = image_size;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index d2db8acb1a55..5f0234ec8038 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1286,6 +1286,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
goto free_addrs;
}
if (bpf_jit_prog(&jit, fp)) {
+ bpf_jit_binary_free(header);
fp = orig_fp;
goto free_addrs;
}
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 40826bba06b6..fcfab6635f9c 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -207,29 +207,19 @@ void lirc_bpf_free(struct rc_dev *rcdev)
bpf_prog_array_free(rcdev->raw->progs);
}
-int lirc_prog_attach(const union bpf_attr *attr)
+int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
- struct bpf_prog *prog;
struct rc_dev *rcdev;
int ret;
if (attr->attach_flags)
return -EINVAL;
- prog = bpf_prog_get_type(attr->attach_bpf_fd,
- BPF_PROG_TYPE_LIRC_MODE2);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
rcdev = rc_dev_get_from_fd(attr->target_fd);
- if (IS_ERR(rcdev)) {
- bpf_prog_put(prog);
+ if (IS_ERR(rcdev))
return PTR_ERR(rcdev);
- }
ret = lirc_bpf_attach(rcdev, prog);
- if (ret)
- bpf_prog_put(prog);
put_device(&rcdev->dev);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 6b15e3b11956..40216d56dddc 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -81,10 +81,10 @@ nfp_bpf_xdp_offload(struct nfp_app *app, struct nfp_net *nn,
ret = nfp_net_bpf_offload(nn, prog, running, extack);
/* Stop offload if replace not possible */
- if (ret && prog)
- nfp_bpf_xdp_offload(app, nn, NULL, extack);
+ if (ret)
+ return ret;
- nn->dp.bpf_offload_xdp = prog && !ret;
+ nn->dp.bpf_offload_xdp = !!prog;
return ret;
}
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 975fb4cf1bb7..79795c5fa7c3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -188,12 +188,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
\
__ret; \
})
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+ enum bpf_prog_type ptype, struct bpf_prog *prog);
+int cgroup_bpf_prog_detach(const union bpf_attr *attr,
+ enum bpf_prog_type ptype);
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
#else
+struct bpf_prog;
struct cgroup_bpf {};
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+ enum bpf_prog_type ptype,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
+
+static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr,
+ enum bpf_prog_type ptype)
+{
+ return -EINVAL;
+}
+
+static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return -EINVAL;
+}
+
#define cgroup_bpf_enabled (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7df32a3200f7..8827e797ff97 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -696,6 +696,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+ struct bpf_prog *prog);
#else
static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
@@ -714,6 +716,12 @@ static inline int sock_map_prog(struct bpf_map *map,
{
return -EOPNOTSUPP;
}
+
+static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
#endif
#if defined(CONFIG_XDP_SOCKETS)
diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h
index 5f8a4283092d..9d9ff755ec29 100644
--- a/include/linux/bpf_lirc.h
+++ b/include/linux/bpf_lirc.h
@@ -5,11 +5,12 @@
#include <uapi/linux/bpf.h>
#ifdef CONFIG_BPF_LIRC_MODE2
-int lirc_prog_attach(const union bpf_attr *attr);
+int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int lirc_prog_detach(const union bpf_attr *attr);
int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
#else
-static inline int lirc_prog_attach(const union bpf_attr *attr)
+static inline int lirc_prog_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
{
return -EINVAL;
}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 20f2659dd829..300baad62c88 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -470,9 +470,7 @@ struct sock_fprog_kern {
};
struct bpf_binary_header {
- u16 pages;
- u16 locked:1;
-
+ u32 pages;
/* Some arches need word alignment for their instructions */
u8 image[] __aligned(4);
};
@@ -481,7 +479,7 @@ struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
- locked:1, /* Program image locked? */
+ undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
@@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
- fp->locked = 1;
- if (set_memory_ro((unsigned long)fp, fp->pages))
- fp->locked = 0;
-#endif
+ fp->undo_set_mem = 1;
+ set_memory_ro((unsigned long)fp, fp->pages);
}
static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
- if (fp->locked) {
- WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
- /* In case set_memory_rw() fails, we want to be the first
- * to crash here instead of some random place later on.
- */
- fp->locked = 0;
- }
-#endif
+ if (fp->undo_set_mem)
+ set_memory_rw((unsigned long)fp, fp->pages);
}
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
- hdr->locked = 1;
- if (set_memory_ro((unsigned long)hdr, hdr->pages))
- hdr->locked = 0;
-#endif
+ set_memory_ro((unsigned long)hdr, hdr->pages);
}
static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
- if (hdr->locked) {
- WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages));
- /* In case set_memory_rw() fails, we want to be the first
- * to crash here instead of some random place later on.
- */
- hdr->locked = 0;
- }
-#endif
+ set_memory_rw((unsigned long)hdr, hdr->pages);
}
static inline struct bpf_binary_header *
@@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp)
return (void *)addr;
}
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp)
-{
- if (!fp->locked)
- return -ENOLCK;
- if (fp->jited) {
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
-
- if (!hdr->locked)
- return -ENOLCK;
- }
-
- return 0;
-}
-#endif
-
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59b19b6a40d7..b7db3261c62d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1857,7 +1857,8 @@ union bpf_attr {
* is resolved), the nexthop address is returned in ipv4_dst
* or ipv6_dst based on family, smac is set to mac address of
* egress device, dmac is set to nexthop mac address, rt_metric
- * is set to metric from route (IPv4/IPv6 only).
+ * is set to metric from route (IPv4/IPv6 only), and ifindex
+ * is set to the device index of the nexthop from the FIB lookup.
*
* *plen* argument is the size of the passed in struct.
* *flags* argument can be a combination of one or more of the
@@ -1873,9 +1874,10 @@ union bpf_attr {
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs.
* Return
- * Egress device index on success, 0 if packet needs to continue
- * up the stack for further processing or a negative error in case
- * of failure.
+ * * < 0 if any input argument is invalid
+ * * 0 on success (packet is forwarded, nexthop neighbor exists)
+ * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ * * packet is not forwarded or needs assist from full stack
*
* int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
* Description
@@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args {
#define BPF_FIB_LOOKUP_DIRECT BIT(0)
#define BPF_FIB_LOOKUP_OUTPUT BIT(1)
+enum {
+ BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */
+ BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */
+ BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */
+ BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */
+ BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */
+ BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
+ BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */
+ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */
+ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */
+};
+
struct bpf_fib_lookup {
/* input: network family for lookup (AF_INET, AF_INET6)
* output: network family of egress nexthop
@@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
/* total length of packet from network header - used for MTU check */
__u16 tot_len;
- __u32 ifindex; /* L3 device index for lookup */
+
+ /* input: L3 device index for lookup
+ * output: device index from FIB lookup
+ */
+ __u32 ifindex;
union {
/* inputs to lookup */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f7c00bd6f8e4..3d83ee7df381 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+ enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
+ cgroup_put(cgrp);
+ return ret;
+}
+
+int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+
+ cgroup_put(cgrp);
+ return ret;
+}
+
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+
+ cgroup_put(cgrp);
+ return ret;
+}
+
/**
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_fill_ill_insns(hdr, size);
hdr->pages = size / PAGE_SIZE;
- hdr->locked = 0;
-
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
return 0;
}
-static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
-{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
- int i, err;
-
- for (i = 0; i < fp->aux->func_cnt; i++) {
- err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
- if (err)
- return err;
- }
-
- return bpf_prog_check_pages_ro_single(fp);
-#endif
- return 0;
-}
-
static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
* all eBPF JITs might immediately support all features.
*/
*err = bpf_check_tail_call(fp);
- if (*err)
- return fp;
-
- /* Checkpoint: at this point onwards any cBPF -> eBPF or
- * native eBPF program is read-only. If we failed to change
- * the page attributes (e.g. allocation failure from
- * splitting large pages), then reject the whole program
- * in order to guarantee not ending up with any W+X pages
- * from BPF side in kernel.
- */
- *err = bpf_prog_check_pages_ro_locked(fp);
+
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 52a91d816c0e..cf7b6a6dbd1f 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -72,6 +72,7 @@ struct bpf_htab {
u32 n_buckets;
u32 elem_size;
struct bpf_sock_progs progs;
+ struct rcu_head rcu;
};
struct htab_elem {
@@ -89,8 +90,8 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
struct sock **entry;
- struct htab_elem *hash_link;
- struct bpf_htab *htab;
+ struct htab_elem __rcu *hash_link;
+ struct bpf_htab __rcu *htab;
};
struct smap_psock {
@@ -120,6 +121,7 @@ struct smap_psock {
struct bpf_prog *bpf_parse;
struct bpf_prog *bpf_verdict;
struct list_head maps;
+ spinlock_t maps_lock;
/* Back reference used when sock callback trigger sockmap operations */
struct sock *sock;
@@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
+static void bpf_tcp_close(struct sock *sk, long timeout);
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
@@ -161,7 +164,42 @@ out:
return !empty;
}
-static struct proto tcp_bpf_proto;
+enum {
+ SOCKMAP_IPV4,
+ SOCKMAP_IPV6,
+ SOCKMAP_NUM_PROTS,
+};
+
+enum {
+ SOCKMAP_BASE,
+ SOCKMAP_TX,
+ SOCKMAP_NUM_CONFIGS,
+};
+
+static struct proto *saved_tcpv6_prot __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
+static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
+ struct proto *base)
+{
+ prot[SOCKMAP_BASE] = *base;
+ prot[SOCKMAP_BASE].close = bpf_tcp_close;
+ prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
+ prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
+
+ prot[SOCKMAP_TX] = prot[SOCKMAP_BASE];
+ prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg;
+ prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage;
+}
+
+static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
+ int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
+
+ sk->sk_prot = &bpf_tcp_prots[family][conf];
+}
+
static int bpf_tcp_init(struct sock *sk)
{
struct smap_psock *psock;
@@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk)
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
- if (psock->bpf_tx_msg) {
- tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
- tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
- tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
- tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
+ /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
+ if (sk->sk_family == AF_INET6 &&
+ unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
+ spin_lock_bh(&tcpv6_prot_lock);
+ if (likely(sk->sk_prot != saved_tcpv6_prot)) {
+ build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
+ smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
+ }
+ spin_unlock_bh(&tcpv6_prot_lock);
}
-
- sk->sk_prot = &tcp_bpf_proto;
+ update_sk_prot(sk, psock);
rcu_read_unlock();
return 0;
}
@@ -219,16 +260,54 @@ out:
rcu_read_unlock();
}
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+ u32 hash, void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+ }
+
+ return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
atomic_dec(&htab->count);
kfree_rcu(l, rcu);
}
+static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+ struct smap_psock *psock)
+{
+ struct smap_psock_map_entry *e;
+
+ spin_lock_bh(&psock->maps_lock);
+ e = list_first_entry_or_null(&psock->maps,
+ struct smap_psock_map_entry,
+ list);
+ if (e)
+ list_del(&e->list);
+ spin_unlock_bh(&psock->maps_lock);
+ return e;
+}
+
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
- struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
struct sock *osk;
@@ -247,7 +326,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
*/
close_fun = psock->save_close;
- write_lock_bh(&sk->sk_callback_lock);
if (psock->cork) {
free_start_sg(psock->sock, psock->cork);
kfree(psock->cork);
@@ -260,20 +338,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(md);
}
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ e = psock_map_pop(sk, psock);
+ while (e) {
if (e->entry) {
osk = cmpxchg(e->entry, sk, NULL);
if (osk == sk) {
- list_del(&e->list);
smap_release_sock(psock, sk);
}
} else {
- hlist_del_rcu(&e->hash_link->hash_node);
- smap_release_sock(psock, e->hash_link->sk);
- free_htab_elem(e->htab, e->hash_link);
+ struct htab_elem *link = rcu_dereference(e->hash_link);
+ struct bpf_htab *htab = rcu_dereference(e->htab);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ struct bucket *b;
+
+ b = __select_bucket(htab, link->hash);
+ head = &b->head;
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head,
+ link->hash, link->key,
+ htab->map.key_size);
+ /* If another thread deleted this object skip deletion.
+ * The refcnt on psock may or may not be zero.
+ */
+ if (l) {
+ hlist_del_rcu(&link->hash_node);
+ smap_release_sock(psock, link->sk);
+ free_htab_elem(htab, link);
+ }
+ raw_spin_unlock_bh(&b->lock);
}
+ e = psock_map_pop(sk, psock);
}
- write_unlock_bh(&sk->sk_callback_lock);
rcu_read_unlock();
close_fun(sk, timeout);
}
@@ -1111,8 +1207,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
static int bpf_tcp_ulp_register(void)
{
- tcp_bpf_proto = tcp_prot;
- tcp_bpf_proto.close = bpf_tcp_close;
+ build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
/* Once BPF TX ULP is registered it is never unregistered. It
* will be in the ULP list for the lifetime of the system. Doing
* duplicate registers is not a problem.
@@ -1357,7 +1452,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
if (refcount_dec_and_test(&psock->refcnt)) {
tcp_cleanup_ulp(sock);
+ write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
clear_bit(SMAP_TX_RUNNING, &psock->state);
rcu_assign_sk_user_data(sock, NULL);
call_rcu_sched(&psock->rcu, smap_destroy_psock);
@@ -1508,6 +1605,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node)
INIT_LIST_HEAD(&psock->maps);
INIT_LIST_HEAD(&psock->ingress);
refcount_set(&psock->refcnt, 1);
+ spin_lock_init(&psock->maps_lock);
rcu_assign_sk_user_data(sock, psock);
sock_hold(sock);
@@ -1564,18 +1662,32 @@ free_stab:
return ERR_PTR(err);
}
-static void smap_list_remove(struct smap_psock *psock,
- struct sock **entry,
- struct htab_elem *hash_link)
+static void smap_list_map_remove(struct smap_psock *psock,
+ struct sock **entry)
{
struct smap_psock_map_entry *e, *tmp;
+ spin_lock_bh(&psock->maps_lock);
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry || e->hash_link == hash_link) {
+ if (e->entry == entry)
list_del(&e->list);
- break;
- }
}
+ spin_unlock_bh(&psock->maps_lock);
+}
+
+static void smap_list_hash_remove(struct smap_psock *psock,
+ struct htab_elem *hash_link)
+{
+ struct smap_psock_map_entry *e, *tmp;
+
+ spin_lock_bh(&psock->maps_lock);
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ struct htab_elem *c = rcu_dereference(e->hash_link);
+
+ if (c == hash_link)
+ list_del(&e->list);
+ }
+ spin_unlock_bh(&psock->maps_lock);
}
static void sock_map_free(struct bpf_map *map)
@@ -1601,7 +1713,6 @@ static void sock_map_free(struct bpf_map *map)
if (!sock)
continue;
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens
@@ -1609,10 +1720,9 @@ static void sock_map_free(struct bpf_map *map)
* to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, &stab->sock_map[i], NULL);
+ smap_list_map_remove(psock, &stab->sock_map[i]);
smap_release_sock(psock, sock);
}
- write_unlock_bh(&sock->sk_callback_lock);
}
rcu_read_unlock();
@@ -1661,17 +1771,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (!sock)
return -EINVAL;
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
if (!psock)
goto out;
if (psock->bpf_parse)
smap_stop_sock(psock, sock);
- smap_list_remove(psock, &stab->sock_map[k], NULL);
+ smap_list_map_remove(psock, &stab->sock_map[k]);
smap_release_sock(psock, sock);
out:
- write_unlock_bh(&sock->sk_callback_lock);
return 0;
}
@@ -1752,7 +1860,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
}
}
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* 2. Do not allow inheriting programs if psock exists and has
@@ -1809,7 +1916,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
if (err)
goto out_free;
smap_init_progs(psock, verdict, parse);
+ write_lock_bh(&sock->sk_callback_lock);
smap_start_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
}
/* 4. Place psock in sockmap for use and stop any programs on
@@ -1819,9 +1928,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
*/
if (map_link) {
e->entry = map_link;
+ spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps);
+ spin_unlock_bh(&psock->maps_lock);
}
- write_unlock_bh(&sock->sk_callback_lock);
return err;
out_free:
smap_release_sock(psock, sock);
@@ -1832,7 +1942,6 @@ out_progs:
}
if (tx_msg)
bpf_prog_put(tx_msg);
- write_unlock_bh(&sock->sk_callback_lock);
kfree(e);
return err;
}
@@ -1869,10 +1978,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
if (osock) {
struct smap_psock *opsock = smap_psock_sk(osock);
- write_lock_bh(&osock->sk_callback_lock);
- smap_list_remove(opsock, &stab->sock_map[i], NULL);
+ smap_list_map_remove(opsock, &stab->sock_map[i]);
smap_release_sock(opsock, osock);
- write_unlock_bh(&osock->sk_callback_lock);
}
out:
return err;
@@ -1915,6 +2022,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
return 0;
}
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+ struct bpf_prog *prog)
+{
+ int ufd = attr->target_fd;
+ struct bpf_map *map;
+ struct fd f;
+ int err;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = sock_map_prog(map, prog, attr->attach_type);
+ fdput(f);
+ return err;
+}
+
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
return NULL;
@@ -2043,14 +2168,13 @@ free_htab:
return ERR_PTR(err);
}
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+static void __bpf_htab_free(struct rcu_head *rcu)
{
- return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
+ struct bpf_htab *htab;
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
- return &__select_bucket(htab, hash)->head;
+ htab = container_of(rcu, struct bpf_htab, rcu);
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
}
static void sock_hash_free(struct bpf_map *map)
@@ -2069,16 +2193,18 @@ static void sock_hash_free(struct bpf_map *map)
*/
rcu_read_lock();
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
+ struct bucket *b = __select_bucket(htab, i);
+ struct hlist_head *head;
struct hlist_node *n;
struct htab_elem *l;
+ raw_spin_lock_bh(&b->lock);
+ head = &b->head;
hlist_for_each_entry_safe(l, n, head, hash_node) {
struct sock *sock = l->sk;
struct smap_psock *psock;
hlist_del_rcu(&l->hash_node);
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get
* the sk_callback_lock before this case but after xchg
@@ -2086,16 +2212,15 @@ static void sock_hash_free(struct bpf_map *map)
* (psock) to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, NULL, l);
+ smap_list_hash_remove(psock, l);
smap_release_sock(psock, sock);
}
- write_unlock_bh(&sock->sk_callback_lock);
- kfree(l);
+ free_htab_elem(htab, l);
}
+ raw_spin_unlock_bh(&b->lock);
}
rcu_read_unlock();
- bpf_map_area_free(htab->buckets);
- kfree(htab);
+ call_rcu(&htab->rcu, __bpf_htab_free);
}
static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
@@ -2122,19 +2247,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
return l_new;
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
- u32 hash, void *key, u32 key_size)
-{
- struct htab_elem *l;
-
- hlist_for_each_entry_rcu(l, head, hash_node) {
- if (l->hash == hash && !memcmp(&l->key, key, key_size))
- return l;
- }
-
- return NULL;
-}
-
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
return jhash(key, key_len, 0);
@@ -2254,9 +2366,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto bucket_err;
}
- e->hash_link = l_new;
- e->htab = container_of(map, struct bpf_htab, map);
+ rcu_assign_pointer(e->hash_link, l_new);
+ rcu_assign_pointer(e->htab,
+ container_of(map, struct bpf_htab, map));
+ spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps);
+ spin_unlock_bh(&psock->maps_lock);
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
@@ -2266,7 +2381,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
psock = smap_psock_sk(l_old->sk);
hlist_del_rcu(&l_old->hash_node);
- smap_list_remove(psock, NULL, l_old);
+ smap_list_hash_remove(psock, l_old);
smap_release_sock(psock, l_old->sk);
free_htab_elem(htab, l_old);
}
@@ -2326,7 +2441,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
struct smap_psock *psock;
hlist_del_rcu(&l->hash_node);
- write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens
@@ -2334,10 +2448,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
* to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, NULL, l);
+ smap_list_hash_remove(psock, l);
smap_release_sock(psock, sock);
}
- write_unlock_bh(&sock->sk_callback_lock);
free_htab_elem(htab, l);
ret = 0;
}
@@ -2383,6 +2496,7 @@ const struct bpf_map_ops sock_hash_ops = {
.map_get_next_key = sock_hash_get_next_key,
.map_update_elem = sock_hash_update_elem,
.map_delete_elem = sock_hash_delete_elem,
+ .map_release_uref = sock_map_release,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35dc466641f2..d10ecd78105f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1483,8 +1483,6 @@ out_free_tp:
return err;
}
-#ifdef CONFIG_CGROUP_BPF
-
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
enum bpf_attach_type attach_type)
{
@@ -1499,40 +1497,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
-static int sockmap_get_from_fd(const union bpf_attr *attr,
- int type, bool attach)
-{
- struct bpf_prog *prog = NULL;
- int ufd = attr->target_fd;
- struct bpf_map *map;
- struct fd f;
- int err;
-
- f = fdget(ufd);
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return PTR_ERR(map);
-
- if (attach) {
- prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
- if (IS_ERR(prog)) {
- fdput(f);
- return PTR_ERR(prog);
- }
- }
-
- err = sock_map_prog(map, prog, attr->attach_type);
- if (err) {
- fdput(f);
- if (prog)
- bpf_prog_put(prog);
- return err;
- }
-
- fdput(f);
- return 0;
-}
-
#define BPF_F_ATTACH_MASK \
(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
@@ -1540,7 +1504,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
- struct cgroup *cgrp;
int ret;
if (!capable(CAP_NET_ADMIN))
@@ -1577,12 +1540,15 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_MSG_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
+ ptype = BPF_PROG_TYPE_SK_MSG;
+ break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+ ptype = BPF_PROG_TYPE_SK_SKB;
+ break;
case BPF_LIRC_MODE2:
- return lirc_prog_attach(attr);
+ ptype = BPF_PROG_TYPE_LIRC_MODE2;
+ break;
default:
return -EINVAL;
}
@@ -1596,18 +1562,20 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return -EINVAL;
}
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp)) {
- bpf_prog_put(prog);
- return PTR_ERR(cgrp);
+ switch (ptype) {
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ ret = sockmap_get_from_fd(attr, ptype, prog);
+ break;
+ case BPF_PROG_TYPE_LIRC_MODE2:
+ ret = lirc_prog_attach(attr, prog);
+ break;
+ default:
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
}
- ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
- attr->attach_flags);
if (ret)
bpf_prog_put(prog);
- cgroup_put(cgrp);
-
return ret;
}
@@ -1616,9 +1584,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
- struct bpf_prog *prog;
- struct cgroup *cgrp;
- int ret;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -1651,29 +1616,17 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_MSG_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
case BPF_LIRC_MODE2:
return lirc_prog_detach(attr);
default:
return -EINVAL;
}
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
- if (IS_ERR(prog))
- prog = NULL;
-
- ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
- if (prog)
- bpf_prog_put(prog);
- cgroup_put(cgrp);
- return ret;
+ return cgroup_bpf_prog_detach(attr, ptype);
}
#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
@@ -1681,9 +1634,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- struct cgroup *cgrp;
- int ret;
-
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
@@ -1711,14 +1661,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
default:
return -EINVAL;
}
- cgrp = cgroup_get_from_fd(attr->query.target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
- ret = cgroup_bpf_query(cgrp, attr, uattr);
- cgroup_put(cgrp);
- return ret;
+
+ return cgroup_bpf_prog_query(attr, uattr);
}
-#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -2365,7 +2310,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
-#ifdef CONFIG_CGROUP_BPF
case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr);
break;
@@ -2375,7 +2319,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_QUERY:
err = bpf_prog_query(&attr, uattr);
break;
-#endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 60aedc879361..08d3d59dca17 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5282,21 +5282,31 @@ static struct bpf_test tests[] = {
{ /* Mainly checking JIT here. */
"BPF_MAXINSNS: Ctx heavy transformations",
{ },
+#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
+ CLASSIC | FLAG_EXPECTED_FAIL,
+#else
CLASSIC,
+#endif
{ },
{
{ 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) },
{ 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }
},
.fill_helper = bpf_fill_maxinsns6,
+ .expected_errcode = -ENOTSUPP,
},
{ /* Mainly checking JIT here. */
"BPF_MAXINSNS: Call heavy transformations",
{ },
+#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
+ CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
+#else
CLASSIC | FLAG_NO_DATA,
+#endif
{ },
{ { 1, 0 }, { 10, 0 } },
.fill_helper = bpf_fill_maxinsns7,
+ .expected_errcode = -ENOTSUPP,
},
{ /* Mainly checking JIT here. */
"BPF_MAXINSNS: Jump heavy test",
@@ -5347,18 +5357,28 @@ static struct bpf_test tests[] = {
{
"BPF_MAXINSNS: exec all MSH",
{ },
+#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
+ CLASSIC | FLAG_EXPECTED_FAIL,
+#else
CLASSIC,
+#endif
{ 0xfa, 0xfb, 0xfc, 0xfd, },
{ { 4, 0xababab83 } },
.fill_helper = bpf_fill_maxinsns13,
+ .expected_errcode = -ENOTSUPP,
},
{
"BPF_MAXINSNS: ld_abs+get_processor_id",
{ },
+#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
+ CLASSIC | FLAG_EXPECTED_FAIL,
+#else
CLASSIC,
+#endif
{ },
{ { 1, 0xbee } },
.fill_helper = bpf_fill_ld_abs_get_processor_id,
+ .expected_errcode = -ENOTSUPP,
},
/*
* LD_IND / LD_ABS on fragmented SKBs
diff --git a/net/core/filter.c b/net/core/filter.c
index e7f12e9f598c..0ca6907d7efe 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
+ params->ifindex = dev->ifindex;
- return dev->ifindex;
+ return 0;
}
#endif
@@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
/* verify forwarding is enabled on this interface */
in_dev = __in_dev_get_rcu(dev);
if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
- return 0;
+ return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
fl4.flowi4_iif = 1;
@@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
tb = fib_get_table(net, tbid);
if (unlikely(!tb))
- return 0;
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
} else {
@@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
}
- if (err || res.type != RTN_UNICAST)
- return 0;
+ if (err) {
+ /* map fib lookup errors to RTN_ type */
+ if (err == -EINVAL)
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ if (err == -EHOSTUNREACH)
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ if (err == -EACCES)
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
+
+ if (res.type != RTN_UNICAST)
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
if (res.fi->fib_nhs > 1)
fib_select_path(net, &res, &fl4, NULL);
@@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) {
mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
if (params->tot_len > mtu)
- return 0;
+ return BPF_FIB_LKUP_RET_FRAG_NEEDED;
}
nh = &res.fi->fib_nh[res.nh_sel];
/* do not handle lwt encaps right now */
if (nh->nh_lwtstate)
- return 0;
+ return BPF_FIB_LKUP_RET_UNSUPP_LWT;
dev = nh->nh_dev;
- if (unlikely(!dev))
- return 0;
-
if (nh->nh_gw)
params->ipv4_dst = nh->nh_gw;
@@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
* rcu_read_lock_bh is not needed here
*/
neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
- if (neigh)
- return bpf_fib_set_fwd_params(params, neigh, dev);
+ if (!neigh)
+ return BPF_FIB_LKUP_RET_NO_NEIGH;
- return 0;
+ return bpf_fib_set_fwd_params(params, neigh, dev);
}
#endif
@@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
/* link local addresses are never forwarded */
if (rt6_need_strict(dst) || rt6_need_strict(src))
- return 0;
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
dev = dev_get_by_index_rcu(net, params->ifindex);
if (unlikely(!dev))
@@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
idev = __in6_dev_get_safely(dev);
if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
- return 0;
+ return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
fl6.flowi6_iif = 1;
@@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
tb = ipv6_stub->fib6_get_table(net, tbid);
if (unlikely(!tb))
- return 0;
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
} else {
@@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
}
if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
- return 0;
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+ if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
+ switch (f6i->fib6_type) {
+ case RTN_BLACKHOLE:
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ case RTN_UNREACHABLE:
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ case RTN_PROHIBIT:
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+ default:
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
+ }
- if (unlikely(f6i->fib6_flags & RTF_REJECT ||
- f6i->fib6_type != RTN_UNICAST))
- return 0;
+ if (f6i->fib6_type != RTN_UNICAST)
+ return BPF_FIB_LKUP_RET_NOT_FWDED;
if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
@@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) {
mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
if (params->tot_len > mtu)
- return 0;
+ return BPF_FIB_LKUP_RET_FRAG_NEEDED;
}
if (f6i->fib6_nh.nh_lwtstate)
- return 0;
+ return BPF_FIB_LKUP_RET_UNSUPP_LWT;
if (f6i->fib6_flags & RTF_GATEWAY)
*dst = f6i->fib6_nh.nh_gw;
@@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
*/
neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
ndisc_hashfn, dst, dev);
- if (neigh)
- return bpf_fib_set_fwd_params(params, neigh, dev);
+ if (!neigh)
+ return BPF_FIB_LKUP_RET_NO_NEIGH;
- return 0;
+ return bpf_fib_set_fwd_params(params, neigh, dev);
}
#endif
@@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
struct net *net = dev_net(skb->dev);
- int index = -EAFNOSUPPORT;
+ int rc = -EAFNOSUPPORT;
if (plen < sizeof(*params))
return -EINVAL;
@@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
case AF_INET:
- index = bpf_ipv4_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv4_fib_lookup(net, params, flags, false);
break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- index = bpf_ipv6_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv6_fib_lookup(net, params, flags, false);
break;
#endif
}
- if (index > 0) {
+ if (!rc) {
struct net_device *dev;
- dev = dev_get_by_index_rcu(net, index);
+ dev = dev_get_by_index_rcu(net, params->ifindex);
if (!is_skb_forwardable(dev, skb))
- index = 0;
+ rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
}
- return index;
+ return rc;
}
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 6673cdb9f55c..a7e94e7ff87d 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
struct ethhdr *eth = data;
struct ipv6hdr *ip6h;
struct iphdr *iph;
- int out_index;
u16 h_proto;
u64 nh_off;
+ int rc;
nh_off = sizeof(*eth);
if (data + nh_off > data_end)
@@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
fib_params.ifindex = ctx->ingress_ifindex;
- out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+ rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
/* verify egress index has xdp support
* TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
@@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
* NOTE: without verification that egress index supports XDP
* forwarding packets are dropped.
*/
- if (out_index > 0) {
+ if (rc == 0) {
if (h_proto == htons(ETH_P_IP))
ip_decrease_ttl(iph);
else if (h_proto == htons(ETH_P_IPV6))
@@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
- return bpf_redirect_map(&tx_port, out_index, 0);
+ return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
}
return XDP_PASS;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 05f42a46d6ed..959aa53ab678 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -694,15 +694,19 @@ static int do_load(int argc, char **argv)
return -1;
}
- if (do_pin_fd(prog_fd, argv[1])) {
- p_err("failed to pin program");
- return -1;
- }
+ if (do_pin_fd(prog_fd, argv[1]))
+ goto err_close_obj;
if (json_output)
jsonw_null(json_wtr);
+ bpf_object__close(obj);
+
return 0;
+
+err_close_obj:
+ bpf_object__close(obj);
+ return -1;
}
static int do_help(int argc, char **argv)
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 7eb613ffef55..b4994a94968b 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -6,6 +6,7 @@ CONFIG_TEST_BPF=m
CONFIG_CGROUP_BPF=y
CONFIG_NETDEVSIM=m
CONFIG_NET_CLS_ACT=y
+CONFIG_NET_SCHED=y
CONFIG_NET_SCH_INGRESS=y
CONFIG_NET_IPIP=y
CONFIG_IPV6=y
diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh
index 35669ccd4d23..9df0d2ac45f8 100755
--- a/tools/testing/selftests/bpf/test_kmod.sh
+++ b/tools/testing/selftests/bpf/test_kmod.sh
@@ -1,6 +1,15 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+msg="skip all tests:"
+if [ "$(id -u)" != "0" ]; then
+ echo $msg please run this as root >&2
+ exit $ksft_skip
+fi
+
SRC_TREE=../../../../
test_run()
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2.sh b/tools/testing/selftests/bpf/test_lirc_mode2.sh
index ce2e15e4f976..677686198df3 100755
--- a/tools/testing/selftests/bpf/test_lirc_mode2.sh
+++ b/tools/testing/selftests/bpf/test_lirc_mode2.sh
@@ -1,6 +1,15 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+msg="skip all tests:"
+if [ $UID != 0 ]; then
+ echo $msg please run this as root >&2
+ exit $ksft_skip
+fi
+
GREEN='\033[0;92m'
RED='\033[0;31m'
NC='\033[0m' # No Color
diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
index 1c77994b5e71..270fa8f49573 100755
--- a/tools/testing/selftests/bpf/test_lwt_seg6local.sh
+++ b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
@@ -21,6 +21,15 @@
# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this
# datagram can be read on NS6 when binding to fb00::6.
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+msg="skip all tests:"
+if [ $UID != 0 ]; then
+ echo $msg please run this as root >&2
+ exit $ksft_skip
+fi
+
TMP_FILE="/tmp/selftest_lwt_seg6local.txt"
cleanup()
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 05c8cb71724a..9e78df207919 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1413,18 +1413,12 @@ out:
int main(int argc, char **argv)
{
- struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
int iov_count = 1, length = 1024, rate = 1;
struct sockmap_options options = {0};
int opt, longindex, err, cg_fd = 0;
char *bpf_file = BPF_SOCKMAP_FILENAME;
int test = PING_PONG;
- if (setrlimit(RLIMIT_MEMLOCK, &r)) {
- perror("setrlimit(RLIMIT_MEMLOCK)");
- return 1;
- }
-
if (argc < 2)
return test_suite();