diff options
author | David S. Miller <davem@davemloft.net> | 2018-10-15 23:21:07 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-10-15 23:21:07 -0700 |
commit | e85679511e48168b0f066b6ae585556b5e0d8f5b (patch) | |
tree | f0b5fc85876cef035e36b76c43f3471fff40db53 | |
parent | c45d7150656fc33181af5806c94bfe0e8f90c1a6 (diff) | |
parent | 0b592b5a01bef5416472ec610d3191e019c144a5 (diff) | |
download | linux-e85679511e48168b0f066b6ae585556b5e0d8f5b.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:
====================
pull-request: bpf-next 2018-10-16
The following pull-request contains BPF updates for your *net-next* tree.
The main changes are:
1) Convert BPF sockmap and kTLS to both use a new sk_msg API and enable
sk_msg BPF integration for the latter, from Daniel and John.
2) Enable BPF syscall side to indicate for maps that they do not support
a map lookup operation as opposed to just missing key, from Prashant.
3) Add bpftool map create command which after map creation pins the
map into bpf fs for further processing, from Jakub.
4) Add bpftool support for attaching programs to maps allowing sock_map
and sock_hash to be used from bpftool, from John.
5) Improve syscall BPF map update/delete path for map-in-map types to
wait a RCU grace period for pending references to complete, from Daniel.
6) Couple of follow-up fixes for the BPF socket lookup to get it
enabled also when IPv6 is compiled as a module, from Joe.
7) Fix a generic-XDP bug to handle the case when the Ethernet header
was mangled and thus update skb's protocol and data, from Jesper.
8) Add a missing BTF header length check between header copies from
user space, from Wenwen.
9) Minor fixups in libbpf to use __u32 instead u32 types and include
proper perf_event.h uapi header instead of perf internal one, from Yonghong.
10) Allow to pass user-defined flags through EXTRA_CFLAGS and EXTRA_LDFLAGS
to bpftool's build, from Jiri.
11) BPF kselftest tweaks to add LWTUNNEL to config fragment and to install
with_addr.sh script from flow dissector selftest, from Anders.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
54 files changed, 4962 insertions, 3659 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 031127139f3b..7f1399ac028e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8193,6 +8193,16 @@ S: Maintained F: net/l3mdev F: include/net/l3mdev.h +L7 BPF FRAMEWORK +M: John Fastabend <john.fastabend@gmail.com> +M: Daniel Borkmann <daniel@iogearbox.net> +L: netdev@vger.kernel.org +S: Maintained +F: include/linux/skmsg.h +F: net/core/skmsg.c +F: net/core/sock_map.c +F: net/ipv4/tcp_bpf.c + LANTIQ / INTEL Ethernet drivers M: Hauke Mehrtens <hauke@hauke-m.de> L: netdev@vger.kernel.org diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9b558713447f..e60fff48288b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -737,33 +737,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) -struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); -struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); -int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog); +#if defined(CONFIG_BPF_STREAM_PARSER) +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); #else -static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ - return NULL; -} - -static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map, - void *key) -{ - return NULL; -} - -static inline int sock_map_prog(struct bpf_map *map, - struct bpf_prog *prog, - u32 type) +static inline int sock_map_prog_update(struct bpf_map *map, + struct bpf_prog *prog, u32 which) { return -EOPNOTSUPP; } -static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog) +static inline int sock_map_get_from_fd(const union bpf_attr *attr, + struct bpf_prog *prog) { return -EINVAL; } @@ -839,6 +824,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; +extern const struct bpf_func_proto bpf_msg_redirect_map_proto; +extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; +extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5432f4c9f50e..fa48343a5ea1 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) +#if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif diff --git a/include/linux/filter.h b/include/linux/filter.h index 6791a0ac0139..5771874bc01e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -520,24 +520,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct sk_msg_buff { - void *data; - void *data_end; - __u32 apply_bytes; - __u32 cork_bytes; - int sg_copybreak; - int sg_start; - int sg_curr; - int sg_end; - struct scatterlist sg_data[MAX_SKB_FRAGS]; - bool sg_copy[MAX_SKB_FRAGS]; - __u32 flags; - struct sock *sk_redir; - struct sock *sk; - struct sk_buff *skb; - struct list_head list; -}; - struct bpf_redirect_info { u32 ifindex; u32 flags; @@ -833,9 +815,6 @@ void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); -struct sock *do_sk_redirect_map(struct sk_buff *skb); -struct sock *do_msg_redirect_map(struct sk_msg_buff *md); - #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h new file mode 100644 index 000000000000..0b919f0bc6d6 --- /dev/null +++ b/include/linux/skmsg.h @@ -0,0 +1,410 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#ifndef _LINUX_SKMSG_H +#define _LINUX_SKMSG_H + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/scatterlist.h> +#include <linux/skbuff.h> + +#include <net/sock.h> +#include <net/tcp.h> +#include <net/strparser.h> + +#define MAX_MSG_FRAGS MAX_SKB_FRAGS + +enum __sk_action { + __SK_DROP = 0, + __SK_PASS, + __SK_REDIRECT, + __SK_NONE, +}; + +struct sk_msg_sg { + u32 start; + u32 curr; + u32 end; + u32 size; + u32 copybreak; + bool copy[MAX_MSG_FRAGS]; + /* The extra element is used for chaining the front and sections when + * the list becomes partitioned (e.g. end < start). The crypto APIs + * require the chaining. + */ + struct scatterlist data[MAX_MSG_FRAGS + 1]; +}; + +struct sk_msg { + struct sk_msg_sg sg; + void *data; + void *data_end; + u32 apply_bytes; + u32 cork_bytes; + u32 flags; + struct sk_buff *skb; + struct sock *sk_redir; + struct sock *sk; + struct list_head list; +}; + +struct sk_psock_progs { + struct bpf_prog *msg_parser; + struct bpf_prog *skb_parser; + struct bpf_prog *skb_verdict; +}; + +enum sk_psock_state_bits { + SK_PSOCK_TX_ENABLED, +}; + +struct sk_psock_link { + struct list_head list; + struct bpf_map *map; + void *link_raw; +}; + +struct sk_psock_parser { + struct strparser strp; + bool enabled; + void (*saved_data_ready)(struct sock *sk); +}; + +struct sk_psock_work_state { + struct sk_buff *skb; + u32 len; + u32 off; +}; + +struct sk_psock { + struct sock *sk; + struct sock *sk_redir; + u32 apply_bytes; + u32 cork_bytes; + u32 eval; + struct sk_msg *cork; + struct sk_psock_progs progs; + struct sk_psock_parser parser; + struct sk_buff_head ingress_skb; + struct list_head ingress_msg; + unsigned long state; + struct list_head link; + spinlock_t link_lock; + refcount_t refcnt; + void (*saved_unhash)(struct sock *sk); + void (*saved_close)(struct sock *sk, long timeout); + void (*saved_write_space)(struct sock *sk); + struct proto *sk_proto; + struct sk_psock_work_state work_state; + struct work_struct work; + union { + struct rcu_head rcu; + struct work_struct gc; + }; +}; + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce); +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len); +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); +int sk_msg_free(struct sock *sk, struct sk_msg *msg); +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); +void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, + u32 bytes); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes); +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes); + +static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) +{ + WARN_ON(i == msg->sg.end && bytes); +} + +static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < bytes) + psock->apply_bytes = 0; + else + psock->apply_bytes -= bytes; + } +} + +#define sk_msg_iter_var_prev(var) \ + do { \ + if (var == 0) \ + var = MAX_MSG_FRAGS - 1; \ + else \ + var--; \ + } while (0) + +#define sk_msg_iter_var_next(var) \ + do { \ + var++; \ + if (var == MAX_MSG_FRAGS) \ + var = 0; \ + } while (0) + +#define sk_msg_iter_prev(msg, which) \ + sk_msg_iter_var_prev(msg->sg.which) + +#define sk_msg_iter_next(msg, which) \ + sk_msg_iter_var_next(msg->sg.which) + +static inline void sk_msg_clear_meta(struct sk_msg *msg) +{ + memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); +} + +static inline void sk_msg_init(struct sk_msg *msg) +{ + BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS); + memset(msg, 0, sizeof(*msg)); + sg_init_marker(msg->sg.data, MAX_MSG_FRAGS); +} + +static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, + int which, u32 size) +{ + dst->sg.data[which] = src->sg.data[which]; + dst->sg.data[which].length = size; + src->sg.data[which].length -= size; + src->sg.data[which].offset += size; +} + +static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) +{ + memcpy(dst, src, sizeof(*src)); + sk_msg_init(src); +} + +static inline u32 sk_msg_elem_used(const struct sk_msg *msg) +{ + return msg->sg.end >= msg->sg.start ? + msg->sg.end - msg->sg.start : + msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); +} + +static inline bool sk_msg_full(const struct sk_msg *msg) +{ + return (msg->sg.end == msg->sg.start) && msg->sg.size; +} + +static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) +{ + return &msg->sg.data[which]; +} + +static inline struct page *sk_msg_page(struct sk_msg *msg, int which) +{ + return sg_page(sk_msg_elem(msg, which)); +} + +static inline bool sk_msg_to_ingress(const struct sk_msg *msg) +{ + return msg->flags & BPF_F_INGRESS; +} + +static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) +{ + struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); + + if (msg->sg.copy[msg->sg.start]) { + msg->data = NULL; + msg->data_end = NULL; + } else { + msg->data = sg_virt(sge); + msg->data_end = msg->data + sge->length; + } +} + +static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, + u32 len, u32 offset) +{ + struct scatterlist *sge; + + get_page(page); + sge = sk_msg_elem(msg, msg->sg.end); + sg_set_page(sge, page, len, offset); + sg_unmark_end(sge); + + msg->sg.copy[msg->sg.end] = true; + msg->sg.size += len; + sk_msg_iter_next(msg, end); +} + +static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) +{ + do { + msg->sg.copy[i] = copy_state; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + } while (1); +} + +static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) +{ + sk_msg_sg_copy(msg, start, true); +} + +static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) +{ + sk_msg_sg_copy(msg, start, false); +} + +static inline struct sk_psock *sk_psock(const struct sock *sk) +{ + return rcu_dereference_sk_user_data(sk); +} + +static inline bool sk_has_psock(struct sock *sk) +{ + return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg; +} + +static inline void sk_psock_queue_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + list_add_tail(&msg->list, &psock->ingress_msg); +} + +static inline bool sk_psock_queue_empty(const struct sk_psock *psock) +{ + return psock ? list_empty(&psock->ingress_msg) : true; +} + +static inline void sk_psock_report_error(struct sk_psock *psock, int err) +{ + struct sock *sk = psock->sk; + + sk->sk_err = err; + sk->sk_error_report(sk); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node); + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg); + +static inline struct sk_psock_link *sk_psock_init_link(void) +{ + return kzalloc(sizeof(struct sk_psock_link), + GFP_ATOMIC | __GFP_NOWARN); +} + +static inline void sk_psock_free_link(struct sk_psock_link *link) +{ + kfree(link); +} + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); +#if defined(CONFIG_BPF_STREAM_PARSER) +void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link); +#else +static inline void sk_psock_unlink(struct sock *sk, + struct sk_psock_link *link) +{ +} +#endif + +void __sk_psock_purge_ingress_msg(struct sk_psock *psock); + +static inline void sk_psock_cork_free(struct sk_psock *psock) +{ + if (psock->cork) { + sk_msg_free(psock->sk, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } +} + +static inline void sk_psock_update_proto(struct sock *sk, + struct sk_psock *psock, + struct proto *ops) +{ + psock->saved_unhash = sk->sk_prot->unhash; + psock->saved_close = sk->sk_prot->close; + psock->saved_write_space = sk->sk_write_space; + + psock->sk_proto = sk->sk_prot; + sk->sk_prot = ops; +} + +static inline void sk_psock_restore_proto(struct sock *sk, + struct sk_psock *psock) +{ + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } +} + +static inline void sk_psock_set_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + set_bit(bit, &psock->state); +} + +static inline void sk_psock_clear_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + clear_bit(bit, &psock->state); +} + +static inline bool sk_psock_test_state(const struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + return test_bit(bit, &psock->state); +} + +static inline struct sk_psock *sk_psock_get(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock && !refcount_inc_not_zero(&psock->refcnt)) + psock = NULL; + rcu_read_unlock(); + return psock; +} + +void sk_psock_stop(struct sock *sk, struct sk_psock *psock); +void sk_psock_destroy(struct rcu_head *rcu); +void sk_psock_drop(struct sock *sk, struct sk_psock *psock); + +static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) +{ + if (refcount_dec_and_test(&psock->refcnt)) + sk_psock_drop(sk, psock); +} + +static inline void psock_set_prog(struct bpf_prog **pprog, + struct bpf_prog *prog) +{ + prog = xchg(pprog, prog); + if (prog) + bpf_prog_put(prog); +} + +static inline void psock_progs_drop(struct sk_psock_progs *progs) +{ + psock_set_prog(&progs->msg_parser, NULL); + psock_set_prog(&progs->skb_parser, NULL); + psock_set_prog(&progs->skb_verdict, NULL); +} + +#endif /* _LINUX_SKMSG_H */ diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 6def0351bcc3..14b789a123e7 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -265,6 +265,11 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, bool force_bind_address_no_port, bool with_lock); + struct sock *(*udp6_lib_lookup)(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, int sdif, struct udp_table *tbl, + struct sk_buff *skb); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/include/net/sock.h b/include/net/sock.h index cfaf261936c8..2440f8b407eb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2214,10 +2214,6 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr, unsigned int *sg_size, - int first_coalesce); - /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0d2929223c70..3600ae0f25c3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -858,6 +858,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); } +static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; +} + +static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->bpf.sk_redir; +} + +static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.sk_redir = NULL; +} + #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] @@ -2057,7 +2072,6 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); -int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); @@ -2065,6 +2079,18 @@ void tcp_cleanup_ulp(struct sock *sk); __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +struct sk_msg; +struct sk_psock; + +int tcp_bpf_init(struct sock *sk); +void tcp_bpf_reinit(struct sock *sk); +int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, + int flags); +int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); +int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + struct msghdr *msg, int len); + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/include/net/tls.h b/include/net/tls.h index 5e853835597e..bab5627ff5e3 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -39,6 +39,8 @@ #include <linux/crypto.h> #include <linux/socket.h> #include <linux/tcp.h> +#include <linux/skmsg.h> + #include <net/tcp.h> #include <net/strparser.h> #include <crypto/aead.h> @@ -103,15 +105,13 @@ struct tls_rec { int tx_flags; int inplace_crypto; - /* AAD | sg_plaintext_data | sg_tag */ - struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1]; - /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ - struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1]; + struct sk_msg msg_plaintext; + struct sk_msg msg_encrypted; - unsigned int sg_plaintext_size; - unsigned int sg_encrypted_size; - int sg_plaintext_num_elem; - int sg_encrypted_num_elem; + /* AAD | msg_plaintext.sg.data | sg_tag */ + struct scatterlist sg_aead_in[2]; + /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ + struct scatterlist sg_aead_out[2]; char aad_space[TLS_AAD_SPACE_SIZE]; struct aead_request aead_req; @@ -142,8 +142,7 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); + struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -223,8 +222,8 @@ struct tls_context { unsigned long flags; bool in_tcp_sendpages; + bool pending_open_record_frags; - u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); @@ -272,8 +271,7 @@ void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); +bool tls_sw_stream_read(const struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0488b8258321..ff8262626b8f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y) obj-$(CONFIG_BPF_SYSCALL) += xskmap.o endif obj-$(CONFIG_BPF_SYSCALL) += offload.o -ifeq ($(CONFIG_STREAM_PARSER),y) -ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_BPF_SYSCALL) += sockmap.o -endif -endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index dded84cbe814..24583da9ffd1 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map) static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 138f0302692e..378cef70341c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2114,6 +2114,9 @@ static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, hdr = &btf->hdr; + if (hdr->hdr_len != hdr_len) + return -EINVAL; + btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3f5bf1af0826..defcf4df6d91 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; -const struct bpf_func_proto bpf_sock_map_update_proto __weak; -const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c deleted file mode 100644 index d37a1a0a6e1e..000000000000 --- a/kernel/bpf/sockmap.c +++ /dev/null @@ -1,2629 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -/* A BPF sock_map is used to store sock objects. This is primarly used - * for doing socket redirect with BPF helper routines. - * - * A sock map may have BPF programs attached to it, currently a program - * used to parse packets and a program to provide a verdict and redirect - * decision on the packet are supported. Any programs attached to a sock - * map are inherited by sock objects when they are added to the map. If - * no BPF programs are attached the sock object may only be used for sock - * redirect. - * - * A sock object may be in multiple maps, but can only inherit a single - * parse or verdict program. If adding a sock object to a map would result - * in having multiple parsing programs the update will return an EBUSY error. - * - * For reference this program is similar to devmap used in XDP context - * reviewing these together may be useful. For an example please review - * ./samples/bpf/sockmap/. - */ -#include <linux/bpf.h> -#include <net/sock.h> -#include <linux/filter.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/kernel.h> -#include <linux/net.h> -#include <linux/skbuff.h> -#include <linux/workqueue.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <net/strparser.h> -#include <net/tcp.h> -#include <linux/ptr_ring.h> -#include <net/inet_common.h> -#include <linux/sched/signal.h> - -#define SOCK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_sock_progs { - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; -}; - -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; - struct bpf_sock_progs progs; - raw_spinlock_t lock; -}; - -struct bucket { - struct hlist_head head; - raw_spinlock_t lock; -}; - -struct bpf_htab { - struct bpf_map map; - struct bucket *buckets; - atomic_t count; - u32 n_buckets; - u32 elem_size; - struct bpf_sock_progs progs; - struct rcu_head rcu; -}; - -struct htab_elem { - struct rcu_head rcu; - struct hlist_node hash_node; - u32 hash; - struct sock *sk; - char key[0]; -}; - -enum smap_psock_state { - SMAP_TX_RUNNING, -}; - -struct smap_psock_map_entry { - struct list_head list; - struct bpf_map *map; - struct sock **entry; - struct htab_elem __rcu *hash_link; -}; - -struct smap_psock { - struct rcu_head rcu; - refcount_t refcnt; - - /* datapath variables */ - struct sk_buff_head rxqueue; - bool strp_enabled; - - /* datapath error path cache across tx work invocations */ - int save_rem; - int save_off; - struct sk_buff *save_skb; - - /* datapath variables for tx_msg ULP */ - struct sock *sk_redir; - int apply_bytes; - int cork_bytes; - int sg_size; - int eval; - struct sk_msg_buff *cork; - struct list_head ingress; - - struct strparser strp; - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; - struct list_head maps; - spinlock_t maps_lock; - - /* Back reference used when sock callback trigger sockmap operations */ - struct sock *sock; - unsigned long state; - - struct work_struct tx_work; - struct work_struct gc_work; - - struct proto *sk_proto; - void (*save_unhash)(struct sock *sk); - void (*save_close)(struct sock *sk, long timeout); - void (*save_data_ready)(struct sock *sk); - void (*save_write_space)(struct sock *sk); -}; - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -static void bpf_tcp_unhash(struct sock *sk); -static void bpf_tcp_close(struct sock *sk, long timeout); - -static inline struct smap_psock *smap_psock_sk(const struct sock *sk) -{ - return rcu_dereference_sk_user_data(sk); -} - -static bool bpf_tcp_stream_read(const struct sock *sk) -{ - struct smap_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - empty = list_empty(&psock->ingress); -out: - rcu_read_unlock(); - return !empty; -} - -enum { - SOCKMAP_IPV4, - SOCKMAP_IPV6, - SOCKMAP_NUM_PROTS, -}; - -enum { - SOCKMAP_BASE, - SOCKMAP_TX, - SOCKMAP_NUM_CONFIGS, -}; - -static struct proto *saved_tcpv6_prot __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; -static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], - struct proto *base) -{ - prot[SOCKMAP_BASE] = *base; - prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; - prot[SOCKMAP_BASE].close = bpf_tcp_close; - prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; - prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; - - prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; - prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; - prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; -} - -static void update_sk_prot(struct sock *sk, struct smap_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; - int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; - - sk->sk_prot = &bpf_tcp_prots[family][conf]; -} - -static int bpf_tcp_init(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - if (unlikely(psock->sk_proto)) { - rcu_read_unlock(); - return -EBUSY; - } - - psock->save_unhash = sk->sk_prot->unhash; - psock->save_close = sk->sk_prot->close; - psock->sk_proto = sk->sk_prot; - - /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ - if (sk->sk_family == AF_INET6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - spin_lock_bh(&tcpv6_prot_lock); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - spin_unlock_bh(&tcpv6_prot_lock); - } - update_sk_prot(sk, psock); - rcu_read_unlock(); - return 0; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); - -static void bpf_tcp_release(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; - } -out: - rcu_read_unlock(); -} - -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} - -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; -} - -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) -{ - atomic_dec(&htab->count); - kfree_rcu(l, rcu); -} - -static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, - struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - - spin_lock_bh(&psock->maps_lock); - e = list_first_entry_or_null(&psock->maps, - struct smap_psock_map_entry, - list); - if (e) - list_del(&e->list); - spin_unlock_bh(&psock->maps_lock); - return e; -} - -static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - struct sk_msg_buff *md, *mtmp; - struct sock *osk; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - e = psock_map_pop(sk, psock); - while (e) { - if (e->entry) { - struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); - - raw_spin_lock_bh(&stab->lock); - osk = *e->entry; - if (osk == sk) { - *e->entry = NULL; - smap_release_sock(psock, sk); - } - raw_spin_unlock_bh(&stab->lock); - } else { - struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - struct bucket *b; - - b = __select_bucket(htab, link->hash); - head = &b->head; - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, - link->hash, link->key, - htab->map.key_size); - /* If another thread deleted this object skip deletion. - * The refcnt on psock may or may not be zero. - */ - if (l && l == link) { - hlist_del_rcu(&link->hash_node); - smap_release_sock(psock, link->sk); - free_htab_elem(htab, link); - } - raw_spin_unlock_bh(&b->lock); - } - kfree(e); - e = psock_map_pop(sk, psock); - } -} - -static void bpf_tcp_unhash(struct sock *sk) -{ - void (*unhash_fun)(struct sock *sk); - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - unhash_fun = psock->save_unhash; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - unhash_fun(sk); -} - -static void bpf_tcp_close(struct sock *sk, long timeout) -{ - void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - close_fun = psock->save_close; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - close_fun(sk, timeout); -} - -enum __sk_action { - __SK_DROP = 0, - __SK_PASS, - __SK_REDIRECT, - __SK_NONE, -}; - -static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { - .name = "bpf_tcp", - .uid = TCP_ULP_BPF, - .user_visible = false, - .owner = NULL, - .init = bpf_tcp_init, - .release = bpf_tcp_release, -}; - -static int memcopy_from_iter(struct sock *sk, - struct sk_msg_buff *md, - struct iov_iter *from, int bytes) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_curr, rc = -ENOSPC; - - do { - int copy; - char *to; - - if (md->sg_copybreak >= sg[i].length) { - md->sg_copybreak = 0; - - if (++i == MAX_SKB_FRAGS) - i = 0; - - if (i == md->sg_end) - break; - } - - copy = sg[i].length - md->sg_copybreak; - to = sg_virt(&sg[i]) + md->sg_copybreak; - md->sg_copybreak += copy; - - if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) - rc = copy_from_iter_nocache(to, copy, from); - else - rc = copy_from_iter(to, copy, from); - - if (rc != copy) { - rc = -EFAULT; - goto out; - } - - bytes -= copy; - if (!bytes) - break; - - md->sg_copybreak = 0; - if (++i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -out: - md->sg_curr = i; - return rc; -} - -static int bpf_tcp_push(struct sock *sk, int apply_bytes, - struct sk_msg_buff *md, - int flags, bool uncharge) -{ - bool apply = apply_bytes; - struct scatterlist *sg; - int offset, ret = 0; - struct page *p; - size_t size; - - while (1) { - sg = md->sg_data + md->sg_start; - size = (apply && apply_bytes < sg->length) ? - apply_bytes : sg->length; - offset = sg->offset; - - tcp_rate_check_app_limited(sk); - p = sg_page(sg); -retry: - ret = do_tcp_sendpages(sk, p, offset, size, flags); - if (ret != size) { - if (ret > 0) { - if (apply) - apply_bytes -= ret; - - sg->offset += ret; - sg->length -= ret; - size -= ret; - offset += ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - goto retry; - } - - return ret; - } - - if (apply) - apply_bytes -= ret; - sg->offset += ret; - sg->length -= ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - - if (!sg->length) { - put_page(p); - md->sg_start++; - if (md->sg_start == MAX_SKB_FRAGS) - md->sg_start = 0; - sg_init_table(sg, 1); - - if (md->sg_start == md->sg_end) - break; - } - - if (apply && !apply_bytes) - break; - } - return 0; -} - -static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data + md->sg_start; - - if (md->sg_copy[md->sg_start]) { - md->data = md->data_end = 0; - } else { - md->data = sg_virt(sg); - md->data_end = md->data + sg->length; - } -} - -static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start; - - do { - int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; - - sk_mem_uncharge(sk, uncharge); - bytes -= uncharge; - if (!bytes) - break; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -} - -static void free_bytes_sg(struct sock *sk, int bytes, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start, free; - - while (bytes && sg[i].length) { - free = sg[i].length; - if (bytes < free) { - sg[i].length -= bytes; - sg[i].offset += bytes; - if (charge) - sk_mem_uncharge(sk, bytes); - break; - } - - if (charge) - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - bytes -= sg[i].length; - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - md->sg_start = i; -} - -static int free_sg(struct sock *sk, int start, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = start, free = 0; - - while (sg[i].length) { - free += sg[i].length; - if (charge) - sk_mem_uncharge(sk, sg[i].length); - if (!md->skb) - put_page(sg_page(&sg[i])); - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - consume_skb(md->skb); - - return free; -} - -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) -{ - int free = free_sg(sk, md->sg_start, md, charge); - - md->sg_start = md->sg_end; - return free; -} - -static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) -{ - return free_sg(sk, md->sg_curr, md, true); -} - -static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) -{ - return ((_rc == SK_PASS) ? - (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP); -} - -static unsigned int smap_do_tx_msg(struct sock *sk, - struct smap_psock *psock, - struct sk_msg_buff *md) -{ - struct bpf_prog *prog; - unsigned int rc, _rc; - - preempt_disable(); - rcu_read_lock(); - - /* If the policy was removed mid-send then default to 'accept' */ - prog = READ_ONCE(psock->bpf_tx_msg); - if (unlikely(!prog)) { - _rc = SK_PASS; - goto verdict; - } - - bpf_compute_data_pointers_sg(md); - md->sk = sk; - rc = (*prog->bpf_func)(md, prog->insnsi); - psock->apply_bytes = md->apply_bytes; - - /* Moving return codes from UAPI namespace into internal namespace */ - _rc = bpf_map_msg_verdict(rc, md); - - /* The psock has a refcount on the sock but not on the map and because - * we need to drop rcu read lock here its possible the map could be - * removed between here and when we need it to execute the sock - * redirect. So do the map lookup now for future use. - */ - if (_rc == __SK_REDIRECT) { - if (psock->sk_redir) - sock_put(psock->sk_redir); - psock->sk_redir = do_msg_redirect_map(md); - if (!psock->sk_redir) { - _rc = __SK_DROP; - goto verdict; - } - sock_hold(psock->sk_redir); - } -verdict: - rcu_read_unlock(); - preempt_enable(); - - return _rc; -} - -static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, - struct smap_psock *psock, - struct sk_msg_buff *md, int flags) -{ - bool apply = apply_bytes; - size_t size, copied = 0; - struct sk_msg_buff *r; - int err = 0, i; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!r)) - return -ENOMEM; - - lock_sock(sk); - r->sg_start = md->sg_start; - i = md->sg_start; - - do { - size = (apply && apply_bytes < md->sg_data[i].length) ? - apply_bytes : md->sg_data[i].length; - - if (!sk_wmem_schedule(sk, size)) { - if (!copied) - err = -ENOMEM; - break; - } - - sk_mem_charge(sk, size); - r->sg_data[i] = md->sg_data[i]; - r->sg_data[i].length = size; - md->sg_data[i].length -= size; - md->sg_data[i].offset += size; - copied += size; - - if (md->sg_data[i].length) { - get_page(sg_page(&r->sg_data[i])); - r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; - } else { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - r->sg_end = i; - } - - if (apply) { - apply_bytes -= size; - if (!apply_bytes) - break; - } - } while (i != md->sg_end); - - md->sg_start = i; - - if (!err) { - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - } else { - free_start_sg(sk, r, true); - kfree(r); - } - - release_sock(sk); - return err; -} - -static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, - struct sk_msg_buff *md, - int flags) -{ - bool ingress = !!(md->flags & BPF_F_INGRESS); - struct smap_psock *psock; - int err = 0; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out_rcu; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto out_rcu; - - rcu_read_unlock(); - - if (ingress) { - err = bpf_tcp_ingress(sk, send, psock, md, flags); - } else { - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); - } - smap_release_sock(psock, sk); - return err; -out_rcu: - rcu_read_unlock(); - return 0; -} - -static inline void bpf_md_init(struct smap_psock *psock) -{ - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; - if (psock->sk_redir) { - sock_put(psock->sk_redir); - psock->sk_redir = NULL; - } - } -} - -static void apply_bytes_dec(struct smap_psock *psock, int i) -{ - if (psock->apply_bytes) { - if (psock->apply_bytes < i) - psock->apply_bytes = 0; - else - psock->apply_bytes -= i; - } -} - -static int bpf_exec_tx_verdict(struct smap_psock *psock, - struct sk_msg_buff *m, - struct sock *sk, - int *copied, int flags) -{ - bool cork = false, enospc = (m->sg_start == m->sg_end); - struct sock *redir; - int err = 0; - int send; - -more_data: - if (psock->eval == __SK_NONE) - psock->eval = smap_do_tx_msg(sk, psock, m); - - if (m->cork_bytes && - m->cork_bytes > psock->sg_size && !enospc) { - psock->cork_bytes = m->cork_bytes - psock->sg_size; - if (!psock->cork) { - psock->cork = kcalloc(1, - sizeof(struct sk_msg_buff), - GFP_ATOMIC | __GFP_NOWARN); - - if (!psock->cork) { - err = -ENOMEM; - goto out_err; - } - } - memcpy(psock->cork, m, sizeof(*m)); - goto out_err; - } - - send = psock->sg_size; - if (psock->apply_bytes && psock->apply_bytes < send) - send = psock->apply_bytes; - - switch (psock->eval) { - case __SK_PASS: - err = bpf_tcp_push(sk, send, m, flags, true); - if (unlikely(err)) { - *copied -= free_start_sg(sk, m, true); - break; - } - - apply_bytes_dec(psock, send); - psock->sg_size -= send; - break; - case __SK_REDIRECT: - redir = psock->sk_redir; - apply_bytes_dec(psock, send); - - if (psock->cork) { - cork = true; - psock->cork = NULL; - } - - return_mem_sg(sk, send, m); - release_sock(sk); - - err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); - lock_sock(sk); - - if (unlikely(err < 0)) { - int free = free_start_sg(sk, m, false); - - psock->sg_size = 0; - if (!cork) - *copied -= free; - } else { - psock->sg_size -= send; - } - - if (cork) { - free_start_sg(sk, m, true); - psock->sg_size = 0; - kfree(m); - m = NULL; - err = 0; - } - break; - case __SK_DROP: - default: - free_bytes_sg(sk, send, m, true); - apply_bytes_dec(psock, send); - *copied -= send; - psock->sg_size -= send; - err = -EACCES; - break; - } - - if (likely(!err)) { - bpf_md_init(psock); - if (m && - m->sg_data[m->sg_start].page_link && - m->sg_data[m->sg_start].length) - goto more_data; - } - -out_err: - return err; -} - -static int bpf_wait_data(struct sock *sk, - struct smap_psock *psk, int flags, - long timeo, int *err) -{ - int rc; - - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, &timeo, - !list_empty(&psk->ingress) || - !skb_queue_empty(&sk->sk_receive_queue), - &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - - return rc; -} - -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct iov_iter *iter = &msg->msg_iter; - struct smap_psock *psock; - int copied = 0; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) - goto out; - rcu_read_unlock(); - - lock_sock(sk); -bytes_ready: - while (copied != len) { - struct scatterlist *sg; - struct sk_msg_buff *md; - int i; - - md = list_first_entry_or_null(&psock->ingress, - struct sk_msg_buff, list); - if (unlikely(!md)) - break; - i = md->sg_start; - do { - struct page *page; - int n, copy; - - sg = &md->sg_data[i]; - copy = sg->length; - page = sg_page(sg); - - if (copied + copy > len) - copy = len - copied; - - n = copy_page_to_iter(page, sg->offset, copy, iter); - if (n != copy) { - md->sg_start = i; - release_sock(sk); - smap_release_sock(psock, sk); - return -EFAULT; - } - - copied += copy; - sg->offset += copy; - sg->length -= copy; - sk_mem_uncharge(sk, copy); - - if (!sg->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!md->skb) - put_page(page); - } - if (copied == len) - break; - } while (i != md->sg_end); - md->sg_start = i; - - if (!sg->length && md->sg_start == md->sg_end) { - list_del(&md->list); - consume_skb(md->skb); - kfree(md); - } - } - - if (!copied) { - long timeo; - int data; - int err = 0; - - timeo = sock_rcvtimeo(sk, nonblock); - data = bpf_wait_data(sk, psock, flags, timeo, &err); - - if (data) { - if (!skb_queue_empty(&sk->sk_receive_queue)) { - release_sock(sk); - smap_release_sock(psock, sk); - copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - return copied; - } - goto bytes_ready; - } - - if (err) - copied = err; - } - - release_sock(sk); - smap_release_sock(psock, sk); - return copied; -out: - rcu_read_unlock(); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -} - - -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - struct sk_msg_buff md = {0}; - unsigned int sg_copy = 0; - struct smap_psock *psock; - int copied = 0, err = 0; - struct scatterlist *sg; - long timeo; - - /* Its possible a sock event or user removed the psock _but_ the ops - * have not been reprogrammed yet so we get here. In this case fallback - * to tcp_sendmsg. Note this only works because we _only_ ever allow - * a single ULP there is no hierarchy here. - */ - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - /* Increment the psock refcnt to ensure its not released while sending a - * message. Required because sk lookup and bpf programs are used in - * separate rcu critical sections. Its OK if we lose the map entry - * but we can't lose the sock reference. - */ - if (!refcount_inc_not_zero(&psock->refcnt)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - sg = md.sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - rcu_read_unlock(); - - lock_sock(sk); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - while (msg_data_left(msg)) { - struct sk_msg_buff *m = NULL; - bool enospc = false; - int copy; - - if (sk->sk_err) { - err = -sk->sk_err; - goto out_err; - } - - copy = msg_data_left(msg); - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - - m = psock->cork_bytes ? psock->cork : &md; - m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; - err = sk_alloc_sg(sk, copy, m->sg_data, - m->sg_start, &m->sg_end, &sg_copy, - m->sg_end - 1); - if (err) { - if (err != -ENOSPC) - goto wait_for_memory; - enospc = true; - copy = sg_copy; - } - - err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); - if (err < 0) { - free_curr_sg(sk, m); - goto out_err; - } - - psock->sg_size += copy; - copied += copy; - sg_copy = 0; - - /* When bytes are being corked skip running BPF program and - * applying verdict unless there is no more buffer space. In - * the ENOSPC case simply run BPF prorgram with currently - * accumulated data. We don't have much choice at this point - * we could try extending the page frags or chaining complex - * frags but even in these cases _eventually_ we will hit an - * OOM scenario. More complex recovery schemes may be - * implemented in the future, but BPF programs must handle - * the case where apply_cork requests are not honored. The - * canonical method to verify this is to check data length. - */ - if (psock->cork_bytes) { - if (copy > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= copy; - - if (psock->cork_bytes && !enospc) - goto out_cork; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); - if (unlikely(err < 0)) - goto out_err; - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - err = sk_stream_wait_memory(sk, &timeo); - if (err) { - if (m && m != psock->cork) - free_start_sg(sk, m, true); - goto out_err; - } - } -out_err: - if (err < 0) - err = sk_stream_error(sk, msg->msg_flags, err); -out_cork: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -} - -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct sk_msg_buff md = {0}, *m = NULL; - int err = 0, copied = 0; - struct smap_psock *psock; - struct scatterlist *sg; - bool enospc = false; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto accept; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto accept; - rcu_read_unlock(); - - lock_sock(sk); - - if (psock->cork_bytes) { - m = psock->cork; - sg = &m->sg_data[m->sg_end]; - } else { - m = &md; - sg = m->sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(m->sg_end == m->sg_start && - m->sg_data[m->sg_end].length)) - goto out_err; - - psock->sg_size += size; - sg_set_page(sg, page, size, offset); - get_page(page); - m->sg_copy[m->sg_end] = true; - sk_mem_charge(sk, size); - m->sg_end++; - copied = size; - - if (m->sg_end == MAX_SKB_FRAGS) - m->sg_end = 0; - - if (m->sg_end == m->sg_start) - enospc = true; - - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - - if (psock->cork_bytes && !enospc) - goto out_err; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -out_err: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -accept: - rcu_read_unlock(); - return tcp_sendpage(sk, page, offset, size, flags); -} - -static void bpf_tcp_msg_add(struct smap_psock *psock, - struct sock *sk, - struct bpf_prog *tx_msg) -{ - struct bpf_prog *orig_tx_msg; - - orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); - if (orig_tx_msg) - bpf_prog_put(orig_tx_msg); -} - -static int bpf_tcp_ulp_register(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - /* Once BPF TX ULP is registered it is never unregistered. It - * will be in the ULP list for the lifetime of the system. Doing - * duplicate registers is not a problem. - */ - return tcp_register_ulp(&bpf_tcp_ulp_ops); -} - -static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) -{ - struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); - int rc; - - if (unlikely(!prog)) - return __SK_DROP; - - skb_orphan(skb); - /* We need to ensure that BPF metadata for maps is also cleared - * when we orphan the skb so that we don't have the possibility - * to reference a stale map. - */ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - rc = (*prog->bpf_func)(skb, prog->insnsi); - preempt_enable(); - skb->sk = NULL; - - /* Moving return codes from UAPI namespace into internal namespace */ - return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP; -} - -static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) -{ - struct sock *sk = psock->sock; - int copied = 0, num_sg; - struct sk_msg_buff *r; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); - if (unlikely(!r)) - return -EAGAIN; - - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(r); - return -EAGAIN; - } - - sg_init_table(r->sg_data, MAX_SKB_FRAGS); - num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); - if (unlikely(num_sg < 0)) { - kfree(r); - return num_sg; - } - sk_mem_charge(sk, skb->len); - copied = skb->len; - r->sg_start = 0; - r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; - r->skb = skb; - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - return copied; -} - -static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -{ - struct smap_psock *peer; - struct sock *sk; - __u32 in; - int rc; - - rc = smap_verdict_func(psock, skb); - switch (rc) { - case __SK_REDIRECT: - sk = do_sk_redirect_map(skb); - if (!sk) { - kfree_skb(skb); - break; - } - - peer = smap_psock_sk(sk); - in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - - if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || - !test_bit(SMAP_TX_RUNNING, &peer->state))) { - kfree_skb(skb); - break; - } - - if (!in && sock_writeable(sk)) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } else if (in && - atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } - /* Fall through and free skb otherwise */ - case __SK_DROP: - default: - kfree_skb(skb); - } -} - -static void smap_report_sk_error(struct smap_psock *psock, int err) -{ - struct sock *sk = psock->sock; - - sk->sk_err = err; - sk->sk_error_report(sk); -} - -static void smap_read_sock_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - smap_do_verdict(psock, skb); - rcu_read_unlock(); -} - -/* Called with lock held on socket */ -static void smap_data_ready(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); - } - rcu_read_unlock(); -} - -static void smap_tx_work(struct work_struct *w) -{ - struct smap_psock *psock; - struct sk_buff *skb; - int rem, off, n; - - psock = container_of(w, struct smap_psock, tx_work); - - /* lock sock to avoid losing sk_socket at some point during loop */ - lock_sock(psock->sock); - if (psock->save_skb) { - skb = psock->save_skb; - rem = psock->save_rem; - off = psock->save_off; - psock->save_skb = NULL; - goto start; - } - - while ((skb = skb_dequeue(&psock->rxqueue))) { - __u32 flags; - - rem = skb->len; - off = 0; -start: - flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - do { - if (likely(psock->sock->sk_socket)) { - if (flags) - n = smap_do_ingress(psock, skb); - else - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - } else { - n = -EINVAL; - } - - if (n <= 0) { - if (n == -EAGAIN) { - /* Retry when space is available */ - psock->save_skb = skb; - psock->save_rem = rem; - psock->save_off = off; - goto out; - } - /* Hard errors break pipe and stop xmit */ - smap_report_sk_error(psock, n ? -n : EPIPE); - clear_bit(SMAP_TX_RUNNING, &psock->state); - kfree_skb(skb); - goto out; - } - rem -= n; - off += n; - } while (rem); - - if (!flags) - kfree_skb(skb); - } -out: - release_sock(psock->sock); -} - -static void smap_write_space(struct sock *sk) -{ - struct smap_psock *psock; - void (*write_space)(struct sock *sk); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) - schedule_work(&psock->tx_work); - write_space = psock->save_write_space; - rcu_read_unlock(); - write_space(sk); -} - -static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) -{ - if (!psock->strp_enabled) - return; - sk->sk_data_ready = psock->save_data_ready; - sk->sk_write_space = psock->save_write_space; - psock->save_data_ready = NULL; - psock->save_write_space = NULL; - strp_stop(&psock->strp); - psock->strp_enabled = false; -} - -static void smap_destroy_psock(struct rcu_head *rcu) -{ - struct smap_psock *psock = container_of(rcu, - struct smap_psock, rcu); - - /* Now that a grace period has passed there is no longer - * any reference to this sock in the sockmap so we can - * destroy the psock, strparser, and bpf programs. But, - * because we use workqueue sync operations we can not - * do it in rcu context - */ - schedule_work(&psock->gc_work); -} - -static bool psock_is_smap_sk(struct sock *sk) -{ - return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock) -{ - if (refcount_dec_and_test(&psock->refcnt)) { - if (psock_is_smap_sk(sock)) - tcp_cleanup_ulp(sock); - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); - } -} - -static int smap_parse_func_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - struct bpf_prog *prog; - int rc; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - prog = READ_ONCE(psock->bpf_parse); - - if (unlikely(!prog)) { - rcu_read_unlock(); - return skb->len; - } - - /* Attach socket for bpf program to use if needed we can do this - * because strparser clones the skb before handing it to a upper - * layer, meaning skb_orphan has been called. We NULL sk on the - * way out to ensure we don't trigger a BUG_ON in skb/sk operations - * later and because we are not charging the memory of this skb to - * any socket yet. - */ - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - rc = (*prog->bpf_func)(skb, prog->insnsi); - skb->sk = NULL; - rcu_read_unlock(); - return rc; -} - -static int smap_read_sock_done(struct strparser *strp, int err) -{ - return err; -} - -static int smap_init_sock(struct smap_psock *psock, - struct sock *sk) -{ - static const struct strp_callbacks cb = { - .rcv_msg = smap_read_sock_strparser, - .parse_msg = smap_parse_func_strparser, - .read_sock_done = smap_read_sock_done, - }; - - return strp_init(&psock->strp, sk, &cb); -} - -static void smap_init_progs(struct smap_psock *psock, - struct bpf_prog *verdict, - struct bpf_prog *parse) -{ - struct bpf_prog *orig_parse, *orig_verdict; - - orig_parse = xchg(&psock->bpf_parse, parse); - orig_verdict = xchg(&psock->bpf_verdict, verdict); - - if (orig_verdict) - bpf_prog_put(orig_verdict); - if (orig_parse) - bpf_prog_put(orig_parse); -} - -static void smap_start_sock(struct smap_psock *psock, struct sock *sk) -{ - if (sk->sk_data_ready == smap_data_ready) - return; - psock->save_data_ready = sk->sk_data_ready; - psock->save_write_space = sk->sk_write_space; - sk->sk_data_ready = smap_data_ready; - sk->sk_write_space = smap_write_space; - psock->strp_enabled = true; -} - -static void sock_map_remove_complete(struct bpf_stab *stab) -{ - bpf_map_area_free(stab->sock_map); - kfree(stab); -} - -static void smap_gc_work(struct work_struct *w) -{ - struct smap_psock_map_entry *e, *tmp; - struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; - - psock = container_of(w, struct smap_psock, gc_work); - - /* no callback lock needed because we already detached sockmap ops */ - if (psock->strp_enabled) - strp_done(&psock->strp); - - cancel_work_sync(&psock->tx_work); - __skb_queue_purge(&psock->rxqueue); - - /* At this point all strparser and xmit work must be complete */ - if (psock->bpf_parse) - bpf_prog_put(psock->bpf_parse); - if (psock->bpf_verdict) - bpf_prog_put(psock->bpf_verdict); - if (psock->bpf_tx_msg) - bpf_prog_put(psock->bpf_tx_msg); - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - list_del(&e->list); - kfree(e); - } - - if (psock->sk_redir) - sock_put(psock->sk_redir); - - sock_put(psock->sock); - kfree(psock); -} - -static struct smap_psock *smap_init_psock(struct sock *sock, int node) -{ - struct smap_psock *psock; - - psock = kzalloc_node(sizeof(struct smap_psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return ERR_PTR(-ENOMEM); - - psock->eval = __SK_NONE; - psock->sock = sock; - skb_queue_head_init(&psock->rxqueue); - INIT_WORK(&psock->tx_work, smap_tx_work); - INIT_WORK(&psock->gc_work, smap_gc_work); - INIT_LIST_HEAD(&psock->maps); - INIT_LIST_HEAD(&psock->ingress); - refcount_set(&psock->refcnt, 1); - spin_lock_init(&psock->maps_lock); - - rcu_assign_sk_user_data(sock, psock); - sock_hold(sock); - return psock; -} - -static struct bpf_map *sock_map_alloc(union bpf_attr *attr) -{ - struct bpf_stab *stab; - u64 cost; - int err; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - - stab = kzalloc(sizeof(*stab), GFP_USER); - if (!stab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&stab->map, attr); - raw_spin_lock_init(&stab->lock); - - /* make sure page count doesn't overflow */ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = -EINVAL; - if (cost >= U32_MAX - PAGE_SIZE) - goto free_stab; - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(stab->map.pages); - if (err) - goto free_stab; - - err = -ENOMEM; - stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *), - stab->map.numa_node); - if (!stab->sock_map) - goto free_stab; - - return &stab->map; -free_stab: - kfree(stab); - return ERR_PTR(err); -} - -static void smap_list_map_remove(struct smap_psock *psock, - struct sock **entry) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void smap_list_hash_remove(struct smap_psock *psock, - struct htab_elem *hash_link) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - struct htab_elem *c = rcu_dereference(e->hash_link); - - if (c == hash_link) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void sock_map_free(struct bpf_map *map) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - raw_spin_lock_bh(&stab->lock); - for (i = 0; i < stab->map.max_entries; i++) { - struct smap_psock *psock; - struct sock *sock; - - sock = stab->sock_map[i]; - if (!sock) - continue; - stab->sock_map[i] = NULL; - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); - } - } - raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); - - sock_map_remove_complete(stab); -} - -static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - u32 i = key ? *(u32 *)key : U32_MAX; - u32 *next = (u32 *)next_key; - - if (i >= stab->map.max_entries) { - *next = 0; - return 0; - } - - if (i == stab->map.max_entries - 1) - return -ENOENT; - - *next = i + 1; - return 0; -} - -struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - if (key >= map->max_entries) - return NULL; - - return READ_ONCE(stab->sock_map[key]); -} - -static int sock_map_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock *psock; - int k = *(u32 *)key; - struct sock *sock; - - if (k >= map->max_entries) - return -EINVAL; - - raw_spin_lock_bh(&stab->lock); - sock = stab->sock_map[k]; - stab->sock_map[k] = NULL; - raw_spin_unlock_bh(&stab->lock); - if (!sock) - return -EINVAL; - - psock = smap_psock_sk(sock); - if (!psock) - return 0; - if (psock->bpf_parse) { - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - smap_list_map_remove(psock, &stab->sock_map[k]); - smap_release_sock(psock, sock); - return 0; -} - -/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are - * done inside rcu critical sections. This ensures on updates that the psock - * will not be released via smap_release_sock() until concurrent updates/deletes - * complete. All operations operate on sock_map using cmpxchg and xchg - * operations to ensure we do not get stale references. Any reads into the - * map must be done with READ_ONCE() because of this. - * - * A psock is destroyed via call_rcu and after any worker threads are cancelled - * and syncd so we are certain all references from the update/lookup/delete - * operations as well as references in the data path are no longer in use. - * - * Psocks may exist in multiple maps, but only a single set of parse/verdict - * programs may be inherited from the maps it belongs to. A reference count - * is kept with the total number of references to the psock from all maps. The - * psock will not be released until this reaches zero. The psock and sock - * user data data use the sk_callback_lock to protect critical data structures - * from concurrent access. This allows us to avoid two updates from modifying - * the user data in sock and the lock is required anyways for modifying - * callbacks, we simply increase its scope slightly. - * - * Rules to follow, - * - psock must always be read inside RCU critical section - * - sk_user_data must only be modified inside sk_callback_lock and read - * inside RCU critical section. - * - psock->maps list must only be read & modified inside sk_callback_lock - * - sock_map must use READ_ONCE and (cmp)xchg operations - * - BPF verdict/parse programs must use READ_ONCE and xchg operations - */ - -static int __sock_map_ctx_update_elem(struct bpf_map *map, - struct bpf_sock_progs *progs, - struct sock *sock, - void *key) -{ - struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock *psock; - bool new = false; - int err = 0; - - /* 1. If sock map has BPF programs those will be inherited by the - * sock being added. If the sock is already attached to BPF programs - * this results in an error. - */ - verdict = READ_ONCE(progs->bpf_verdict); - parse = READ_ONCE(progs->bpf_parse); - tx_msg = READ_ONCE(progs->bpf_tx_msg); - - if (parse && verdict) { - /* bpf prog refcnt may be zero if a concurrent attach operation - * removes the program after the above READ_ONCE() but before - * we increment the refcnt. If this is the case abort with an - * error. - */ - verdict = bpf_prog_inc_not_zero(verdict); - if (IS_ERR(verdict)) - return PTR_ERR(verdict); - - parse = bpf_prog_inc_not_zero(parse); - if (IS_ERR(parse)) { - bpf_prog_put(verdict); - return PTR_ERR(parse); - } - } - - if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(tx_msg); - if (IS_ERR(tx_msg)) { - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - return PTR_ERR(tx_msg); - } - } - - psock = smap_psock_sk(sock); - - /* 2. Do not allow inheriting programs if psock exists and has - * already inherited programs. This would create confusion on - * which parser/verdict program is running. If no psock exists - * create one. Inside sk_callback_lock to ensure concurrent create - * doesn't update user data. - */ - if (psock) { - if (!psock_is_smap_sk(sock)) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_parse) && parse) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { - err = -EBUSY; - goto out_progs; - } - if (!refcount_inc_not_zero(&psock->refcnt)) { - err = -EAGAIN; - goto out_progs; - } - } else { - psock = smap_init_psock(sock, map->numa_node); - if (IS_ERR(psock)) { - err = PTR_ERR(psock); - goto out_progs; - } - - set_bit(SMAP_TX_RUNNING, &psock->state); - new = true; - } - - /* 3. At this point we have a reference to a valid psock that is - * running. Attach any BPF programs needed. - */ - if (tx_msg) - bpf_tcp_msg_add(psock, sock, tx_msg); - if (new) { - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_free; - } - - if (parse && verdict && !psock->strp_enabled) { - err = smap_init_sock(psock, sock); - if (err) - goto out_free; - smap_init_progs(psock, verdict, parse); - write_lock_bh(&sock->sk_callback_lock); - smap_start_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - - return err; -out_free: - smap_release_sock(psock, sock); -out_progs: - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - if (tx_msg) - bpf_prog_put(tx_msg); - return err; -} - -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock = skops->sk; - struct smap_psock_map_entry *e; - struct smap_psock *psock; - u32 i = *(u32 *)key; - int err; - - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto out; - - /* psock guaranteed to be present. */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&stab->lock); - osock = stab->sock_map[i]; - if (osock && flags == BPF_NOEXIST) { - err = -EEXIST; - goto out_unlock; - } - if (!osock && flags == BPF_EXIST) { - err = -ENOENT; - goto out_unlock; - } - - e->entry = &stab->sock_map[i]; - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - stab->sock_map[i] = sock; - if (osock) { - psock = smap_psock_sk(osock); - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, osock); - } - raw_spin_unlock_bh(&stab->lock); - return 0; -out_unlock: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&stab->lock); -out: - kfree(e); - return err; -} - -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } else { - return -EINVAL; - } - - switch (type) { - case BPF_SK_MSG_VERDICT: - orig = xchg(&progs->bpf_tx_msg, prog); - break; - case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&progs->bpf_parse, prog); - break; - case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&progs->bpf_verdict, prog); - break; - default: - return -EOPNOTSUPP; - } - - if (orig) - bpf_prog_put(orig); - - return 0; -} - -int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog) -{ - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - err = sock_map_prog(map, prog, attr->attach_type); - fdput(f); - return err; -} - -static void *sock_map_lookup(struct bpf_map *map, void *key) -{ - return NULL; -} - -static int sock_map_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. - */ - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP || - skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_map_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static void sock_map_release(struct bpf_map *map) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } - - orig = xchg(&progs->bpf_parse, NULL); - if (orig) - bpf_prog_put(orig); - orig = xchg(&progs->bpf_verdict, NULL); - if (orig) - bpf_prog_put(orig); - - orig = xchg(&progs->bpf_tx_msg, NULL); - if (orig) - bpf_prog_put(orig); -} - -static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) -{ - struct bpf_htab *htab; - int i, err; - u64 cost; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || - attr->key_size == 0 || - attr->value_size != 4 || - attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return ERR_PTR(-E2BIG); - - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - - htab = kzalloc(sizeof(*htab), GFP_USER); - if (!htab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&htab->map, attr); - - htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8); - err = -EINVAL; - if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct bucket)) - goto free_htab; - - cost = (u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries; - - if (cost >= U32_MAX - PAGE_SIZE) - goto free_htab; - - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(htab->map.pages); - if (err) - goto free_htab; - - err = -ENOMEM; - htab->buckets = bpf_map_area_alloc( - htab->n_buckets * sizeof(struct bucket), - htab->map.numa_node); - if (!htab->buckets) - goto free_htab; - - for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); - raw_spin_lock_init(&htab->buckets[i].lock); - } - - return &htab->map; -free_htab: - kfree(htab); - return ERR_PTR(err); -} - -static void __bpf_htab_free(struct rcu_head *rcu) -{ - struct bpf_htab *htab; - - htab = container_of(rcu, struct bpf_htab, rcu); - bpf_map_area_free(htab->buckets); - kfree(htab); -} - -static void sock_hash_free(struct bpf_map *map) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - for (i = 0; i < htab->n_buckets; i++) { - struct bucket *b = __select_bucket(htab, i); - struct hlist_head *head; - struct hlist_node *n; - struct htab_elem *l; - - raw_spin_lock_bh(&b->lock); - head = &b->head; - hlist_for_each_entry_safe(l, n, head, hash_node) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get - * the sk_callback_lock before this case but after xchg - * causing the refcnt to hit zero and sock user data - * (psock) to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - } - raw_spin_unlock_bh(&b->lock); - } - rcu_read_unlock(); - call_rcu(&htab->rcu, __bpf_htab_free); -} - -static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, - void *key, u32 key_size, u32 hash, - struct sock *sk, - struct htab_elem *old_elem) -{ - struct htab_elem *l_new; - - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - if (!old_elem) { - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); - } - } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); - if (!l_new) { - atomic_dec(&htab->count); - return ERR_PTR(-ENOMEM); - } - - memcpy(l_new->key, key, key_size); - l_new->sk = sk; - l_new->hash = hash; - return l_new; -} - -static inline u32 htab_map_hash(const void *key, u32 key_len) -{ - return jhash(key, key_len, 0); -} - -static int sock_hash_get_next_key(struct bpf_map *map, - void *key, void *next_key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l, *next_l; - struct hlist_head *h; - u32 hash, key_size; - int i = 0; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - key_size = map->key_size; - if (!key) - goto find_first_elem; - hash = htab_map_hash(key, key_size); - h = select_bucket(htab, hash); - - l = lookup_elem_raw(h, hash, key, key_size); - if (!l) - goto find_first_elem; - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), - struct htab_elem, hash_node); - if (next_l) { - memcpy(next_key, next_l->key, key_size); - return 0; - } - - /* no more elements in this hash list, go to the next bucket */ - i = hash & (htab->n_buckets - 1); - i++; - -find_first_elem: - /* iterate over buckets */ - for (; i < htab->n_buckets; i++) { - h = select_bucket(htab, i); - - /* pick first element in the bucket */ - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(h)), - struct htab_elem, hash_node); - if (next_l) { - /* if it's not empty, just return it */ - memcpy(next_key, next_l->key, key_size); - return 0; - } - } - - /* iterated over all buckets and all elements */ - return -ENOENT; -} - -static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 map_flags) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_sock_progs *progs = &htab->progs; - struct htab_elem *l_new = NULL, *l_old; - struct smap_psock_map_entry *e = NULL; - struct hlist_head *head; - struct smap_psock *psock; - u32 key_size, hash; - struct sock *sock; - struct bucket *b; - int err; - - sock = skops->sk; - - if (sock->sk_type != SOCK_STREAM || - sock->sk_protocol != IPPROTO_TCP) - return -EOPNOTSUPP; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - WARN_ON_ONCE(!rcu_read_lock_held()); - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto err; - - /* psock is valid here because otherwise above *ctx_update_elem would - * have thrown an error. It is safe to skip error check. - */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&b->lock); - l_old = lookup_elem_raw(head, hash, key, key_size); - if (l_old && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto bucket_err; - } - if (!l_old && map_flags == BPF_EXIST) { - err = -ENOENT; - goto bucket_err; - } - - l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); - if (IS_ERR(l_new)) { - err = PTR_ERR(l_new); - goto bucket_err; - } - - rcu_assign_pointer(e->hash_link, l_new); - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - /* add new element to the head of the list, so that - * concurrent search will find it before old elem - */ - hlist_add_head_rcu(&l_new->hash_node, head); - if (l_old) { - psock = smap_psock_sk(l_old->sk); - - hlist_del_rcu(&l_old->hash_node); - smap_list_hash_remove(psock, l_old); - smap_release_sock(psock, l_old->sk); - free_htab_elem(htab, l_old); - } - raw_spin_unlock_bh(&b->lock); - return 0; -bucket_err: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&b->lock); -err: - kfree(e); - return err; -} - -static int sock_hash_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. - */ - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP || - skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_hash_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static int sock_hash_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct bucket *b; - struct htab_elem *l; - u32 hash, key_size; - int ret = -ENOENT; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, hash, key, key_size); - if (l) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - ret = 0; - } - raw_spin_unlock_bh(&b->lock); - return ret; -} - -struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - u32 key_size, hash; - struct bucket *b; - struct sock *sk; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - l = lookup_elem_raw(head, hash, key, key_size); - sk = l ? l->sk : NULL; - return sk; -} - -const struct bpf_map_ops sock_map_ops = { - .map_alloc = sock_map_alloc, - .map_free = sock_map_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_map_get_next_key, - .map_update_elem = sock_map_update_elem, - .map_delete_elem = sock_map_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -const struct bpf_map_ops sock_hash_ops = { - .map_alloc = sock_hash_alloc, - .map_free = sock_hash_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, - .map_delete_elem = sock_hash_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) -{ - return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || - ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; -} -BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. This checks that the sock ops triggering the update is - * one indicating we are (or will be soon) in an ESTABLISHED state. - */ - if (!bpf_is_valid_sock_op(bpf_sock)) - return -EOPNOTSUPP; - return sock_map_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_map_update_proto = { - .func = bpf_sock_map_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (!bpf_is_valid_sock_op(bpf_sock)) - return -EOPNOTSUPP; - return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_hash_update_proto = { - .func = bpf_sock_hash_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8061a439ef18..b2ade10f7ec3 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_get_stack_proto = { /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5742df21598c..f4ecd6ed2252 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -719,10 +719,15 @@ static int map_lookup_elem(union bpf_attr *attr) } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); - if (ptr) + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + } else if (!ptr) { + err = -ENOENT; + } else { + err = 0; memcpy(value, ptr, value_size); + } rcu_read_unlock(); - err = ptr ? 0 : -ENOENT; } if (err) @@ -743,6 +748,17 @@ err_put: return err; } +static void maybe_wait_bpf_programs(struct bpf_map *map) +{ + /* Wait for any running BPF programs to complete so that + * userspace, when we return to it, knows that all programs + * that could be running use the new map value. + */ + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) + synchronize_rcu(); +} + #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr) @@ -837,6 +853,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); + maybe_wait_bpf_programs(map); out: free_value: kfree(value); @@ -889,6 +906,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); + maybe_wait_bpf_programs(map); out: kfree(key); err_put: @@ -1646,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - ret = sockmap_get_from_fd(attr, ptype, prog); + ret = sock_map_get_from_fd(attr, prog); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); @@ -1700,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..ef0b7b6ef8a5 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -154,7 +154,7 @@ void __xsk_map_flush(struct bpf_map *map) static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, diff --git a/net/Kconfig b/net/Kconfig index 228dfa382eec..f235edb593ba 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -300,8 +300,11 @@ config BPF_JIT config BPF_STREAM_PARSER bool "enable BPF STREAM_PARSER" + depends on INET depends on BPF_SYSCALL + depends on CGROUP_BPF select STREAM_PARSER + select NET_SOCK_MSG ---help--- Enabling this allows a stream parser to be used with BPF_MAP_TYPE_SOCKMAP. @@ -413,6 +416,14 @@ config GRO_CELLS config SOCK_VALIDATE_XMIT bool +config NET_SOCK_MSG + bool + default n + help + The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or + ULPs (upper layer modules, e.g. TLS) to process L7 application data + with the help of BPF programs. + config NET_DEVLINK tristate "Network physical/parent device Netlink interface" help diff --git a/net/core/Makefile b/net/core/Makefile index 80175e6a2eb8..fccd31e0e7f7 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o +obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/dev.c b/net/core/dev.c index 8497feea8fb5..022ad73d6253 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4291,6 +4291,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + __be16 orig_eth_type; + struct ethhdr *eth; + bool orig_bcast; int hlen, off; u32 mac_len; @@ -4331,6 +4334,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_hard_start = skb->data - skb_headroom(skb); orig_data_end = xdp->data_end; orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; rxqueue = netif_get_rxqueue(skb); xdp->rxq = &rxqueue->xdp_rxq; @@ -4354,6 +4360,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, } + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + } + switch (act) { case XDP_REDIRECT: case XDP_TX: diff --git a/net/core/filter.c b/net/core/filter.c index 80da21b097b8..1a3ac6c46873 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -38,6 +38,7 @@ #include <net/protocol.h> #include <net/netlink.h> #include <linux/skbuff.h> +#include <linux/skmsg.h> #include <net/sock.h> #include <net/flow_dissector.h> #include <linux/errno.h> @@ -2142,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, - struct bpf_map *, map, void *, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { - .func = bpf_sk_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, - struct bpf_map *, map, u32, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_sk_redirect_map(struct sk_buff *skb) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - return tcb->bpf.sk_redir; -} - -static const struct bpf_func_proto bpf_sk_redirect_map_proto = { - .func = bpf_sk_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, - struct bpf_map *, map, void *, key, u64, flags) -{ - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_hash_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { - .func = bpf_msg_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) -{ - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_map_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) -{ - return msg->sk_redir; -} - -static const struct bpf_func_proto bpf_msg_redirect_map_proto = { - .func = bpf_msg_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; @@ -2272,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; @@ -2286,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -#define sk_msg_iter_var(var) \ - do { \ - var++; \ - if (var == MAX_SKB_FRAGS) \ - var = 0; \ - } while (0) - -BPF_CALL_4(bpf_msg_pull_data, - struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, + u32, end, u64, flags) { - unsigned int len = 0, offset = 0, copy = 0, poffset = 0; - int bytes = end - start, bytes_sg_total; - struct scatterlist *sg = msg->sg_data; - int first_sg, last_sg, i, shift; - unsigned char *p, *to, *from; + u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; + u32 first_sge, last_sge, i, shift, bytes_sg_total; + struct scatterlist *sge; + u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ - i = msg->sg_start; + i = msg->sg.start; do { - len = sg[i].length; + len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; offset += len; - sk_msg_iter_var(i); - } while (i != msg->sg_end); + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; - first_sg = i; + first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg_copy[i] && bytes_sg_total <= len) + if (!msg->sg.copy[i] && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2338,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data, * will copy the entire sg entry. */ do { - copy += sg[i].length; - sk_msg_iter_var(i); + copy += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; - } while (i != msg->sg_end); - last_sg = i; + } while (i != msg->sg.end); + last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; @@ -2352,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data, get_order(copy)); if (unlikely(!page)) return -ENOMEM; - p = page_address(page); - i = first_sg; + raw = page_address(page); + i = first_sge; do { - from = sg_virt(&sg[i]); - len = sg[i].length; - to = p + poffset; + sge = sk_msg_elem(msg, i); + from = sg_virt(sge); + len = sge->length; + to = raw + poffset; memcpy(to, from, len); poffset += len; - sg[i].length = 0; - put_page(sg_page(&sg[i])); + sge->length = 0; + put_page(sg_page(sge)); - sk_msg_iter_var(i); - } while (i != last_sg); + sk_msg_iter_var_next(i); + } while (i != last_sge); - sg[first_sg].length = copy; - sg_set_page(&sg[first_sg], page, copy, 0); + sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ - WARN_ON_ONCE(last_sg == first_sg); - shift = last_sg > first_sg ? - last_sg - first_sg - 1 : - MAX_SKB_FRAGS - first_sg + last_sg - 1; + WARN_ON_ONCE(last_sge == first_sge); + shift = last_sge > first_sge ? + last_sge - first_sge - 1 : + MAX_SKB_FRAGS - first_sge + last_sge - 1; if (!shift) goto out; - i = first_sg; - sk_msg_iter_var(i); + i = first_sge; + sk_msg_iter_var_next(i); do { - int move_from; + u32 move_from; - if (i + shift >= MAX_SKB_FRAGS) - move_from = i + shift - MAX_SKB_FRAGS; + if (i + shift >= MAX_MSG_FRAGS) + move_from = i + shift - MAX_MSG_FRAGS; else move_from = i + shift; - - if (move_from == msg->sg_end) + if (move_from == msg->sg.end) break; - sg[i] = sg[move_from]; - sg[move_from].length = 0; - sg[move_from].page_link = 0; - sg[move_from].offset = 0; - - sk_msg_iter_var(i); + msg->sg.data[i] = msg->sg.data[move_from]; + msg->sg.data[move_from].length = 0; + msg->sg.data[move_from].page_link = 0; + msg->sg.data[move_from].offset = 0; + sk_msg_iter_var_next(i); } while (1); - msg->sg_end -= shift; - if (msg->sg_end < 0) - msg->sg_end += MAX_SKB_FRAGS; + + msg->sg.end = msg->sg.end - shift > msg->sg.end ? + msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift; out: - msg->data = sg_virt(&sg[first_sg]) + start - offset; + msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; - return 0; } @@ -4821,9 +4696,12 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct sk_buff *skb, u8 family, u8 proto) { - int dif = skb->dev->ifindex; bool refcounted = false; struct sock *sk = NULL; + int dif = 0; + + if (skb->dev) + dif = skb->dev->ifindex; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; @@ -4839,21 +4717,24 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &udp_table, skb); -#if IS_REACHABLE(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + u16 hnum = ntohs(tuple->ipv6.dport); int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, src6, tuple->ipv6.sport, - dst6, tuple->ipv6.dport, + dst6, hnum, dif, sdif, &refcounted); - else - sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, tuple->ipv6.dport, - dif, sdif, &udp_table, skb); + else if (likely(ipv6_bpf_stub)) + sk = ipv6_bpf_stub->udp6_lib_lookup(net, + src6, tuple->ipv6.sport, + dst6, hnum, + dif, sdif, + &udp_table, skb); #endif } @@ -5200,6 +5081,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; + static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5223,6 +5107,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; +const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5244,6 +5131,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; +const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -6998,22 +6888,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct sk_msg_md, data): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data)); + offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data_end)); + offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; @@ -7022,9 +6912,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; @@ -7034,9 +6924,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); @@ -7051,9 +6941,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + @@ -7072,9 +6962,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + @@ -7088,9 +6978,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD @@ -7102,9 +6992,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; diff --git a/net/core/skmsg.c b/net/core/skmsg.c new file mode 100644 index 000000000000..56a99d0c9aa0 --- /dev/null +++ b/net/core/skmsg.c @@ -0,0 +1,802 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/skmsg.h> +#include <linux/skbuff.h> +#include <linux/scatterlist.h> + +#include <net/sock.h> +#include <net/tcp.h> + +static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) +{ + if (msg->sg.end > msg->sg.start && + elem_first_coalesce < msg->sg.end) + return true; + + if (msg->sg.end < msg->sg.start && + (elem_first_coalesce > msg->sg.start || + elem_first_coalesce < msg->sg.end)) + return true; + + return false; +} + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce) +{ + struct page_frag *pfrag = sk_page_frag(sk); + int ret = 0; + + len -= msg->sg.size; + while (len > 0) { + struct scatterlist *sge; + u32 orig_offset; + int use, i; + + if (!sk_page_frag_refill(sk, pfrag)) + return -ENOMEM; + + orig_offset = pfrag->offset; + use = min_t(int, len, pfrag->size - orig_offset); + if (!sk_wmem_schedule(sk, use)) + return -ENOMEM; + + i = msg->sg.end; + sk_msg_iter_var_prev(i); + sge = &msg->sg.data[i]; + + if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && + sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; + } else { + if (sk_msg_full(msg)) { + ret = -ENOSPC; + break; + } + + sge = &msg->sg.data[msg->sg.end]; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + sk_msg_iter_next(msg, end); + } + + sk_mem_charge(sk, use); + msg->sg.size += use; + pfrag->offset += use; + len -= use; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_alloc); + +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len) +{ + int i = src->sg.start; + struct scatterlist *sge = sk_msg_elem(src, i); + u32 sge_len, sge_off; + + if (sk_msg_full(dst)) + return -ENOSPC; + + while (off) { + if (sge->length > off) + break; + off -= sge->length; + sk_msg_iter_var_next(i); + if (i == src->sg.end && off) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + while (len) { + sge_len = sge->length - off; + sge_off = sge->offset + off; + if (sge_len > len) + sge_len = len; + off = 0; + len -= sge_len; + sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + sk_mem_charge(sk, sge_len); + sk_msg_iter_var_next(i); + if (i == src->sg.end && len) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_msg_clone); + +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (bytes < sge->length) { + sge->length -= bytes; + sge->offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sge->length); + bytes -= sge->length; + sge->length = 0; + sge->offset = 0; + sk_msg_iter_var_next(i); + } while (bytes && i != msg->sg.end); + msg->sg.start = i; +} +EXPORT_SYMBOL_GPL(sk_msg_return_zero); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = &msg->sg.data[i]; + int uncharge = (bytes < sge->length) ? bytes : sge->length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +} +EXPORT_SYMBOL_GPL(sk_msg_return); + +static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + u32 len = sge->length; + + if (charge) + sk_mem_uncharge(sk, len); + if (!msg->skb) + put_page(sg_page(sge)); + memset(sge, 0, sizeof(*sge)); + return len; +} + +static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + int freed = 0; + + while (msg->sg.size) { + msg->sg.size -= sge->length; + freed += sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, msg->sg.size); + sge = sk_msg_elem(msg, i); + } + if (msg->skb) + consume_skb(msg->skb); + sk_msg_init(msg); + return freed; +} + +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, false); +} +EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); + +int sk_msg_free(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free); + +static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, + u32 bytes, bool charge) +{ + struct scatterlist *sge; + u32 i = msg->sg.start; + + while (bytes) { + sge = sk_msg_elem(msg, i); + if (!sge->length) + break; + if (bytes < sge->length) { + if (charge) + sk_mem_uncharge(sk, bytes); + sge->length -= bytes; + sge->offset += bytes; + msg->sg.size -= bytes; + break; + } + + msg->sg.size -= sge->length; + bytes -= sge->length; + sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, bytes); + } + msg->sg.start = i; +} + +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free_partial); + +void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, + u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, false); +} + +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) +{ + int trim = msg->sg.size - len; + u32 i = msg->sg.end; + + if (trim <= 0) { + WARN_ON(trim < 0); + return; + } + + sk_msg_iter_var_prev(i); + msg->sg.size = len; + while (msg->sg.data[i].length && + trim >= msg->sg.data[i].length) { + trim -= msg->sg.data[i].length; + sk_msg_free_elem(sk, msg, i, true); + sk_msg_iter_var_prev(i); + if (!trim) + goto out; + } + + msg->sg.data[i].length -= trim; + sk_mem_uncharge(sk, trim); +out: + /* If we trim data before curr pointer update copybreak and current + * so that any future copy operations start at new copy location. + * However trimed data that has not yet been used in a copy op + * does not require an update. + */ + if (msg->sg.curr >= i) { + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } + sk_msg_iter_var_next(i); + msg->sg.end = i; +} +EXPORT_SYMBOL_GPL(sk_msg_trim); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); + const int to_max_pages = MAX_MSG_FRAGS; + struct page *pages[MAX_MSG_FRAGS]; + ssize_t orig, copied, use, offset; + + orig = msg->sg.size; + while (bytes > 0) { + i = 0; + maxpages = to_max_pages - num_elems; + if (maxpages == 0) { + ret = -EFAULT; + goto out; + } + + copied = iov_iter_get_pages(from, pages, bytes, maxpages, + &offset); + if (copied <= 0) { + ret = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + bytes -= copied; + msg->sg.size += copied; + + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + sg_set_page(&msg->sg.data[msg->sg.end], + pages[i], use, offset); + sg_unmark_end(&msg->sg.data[msg->sg.end]); + sk_mem_charge(sk, use); + + offset = 0; + copied -= use; + sk_msg_iter_next(msg, end); + num_elems++; + i++; + } + /* When zerocopy is mixed with sk_msg_*copy* operations we + * may have a copybreak set in this case clear and prefer + * zerocopy remainder when possible. + */ + msg->sg.copybreak = 0; + msg->sg.curr = msg->sg.end; + } +out: + /* Revert iov_iter updates, msg will need to use 'trim' later if it + * also needs to be cleared. + */ + if (ret) + iov_iter_revert(from, msg->sg.size - orig); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); + +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int ret = -ENOSPC, i = msg->sg.curr; + struct scatterlist *sge; + u32 copy, buf_size; + void *to; + + do { + sge = sk_msg_elem(msg, i); + /* This is possible if a trim operation shrunk the buffer */ + if (msg->sg.copybreak >= sge->length) { + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + sge = sk_msg_elem(msg, i); + } + + buf_size = sge->length - msg->sg.copybreak; + copy = (buf_size > bytes) ? bytes : buf_size; + to = sg_virt(sge) + msg->sg.copybreak; + msg->sg.copybreak += copy; + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + ret = copy_from_iter_nocache(to, copy, from); + else + ret = copy_from_iter(to, copy, from); + if (ret != copy) { + ret = -EFAULT; + goto out; + } + bytes -= copy; + if (!bytes) + break; + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +out: + msg->sg.curr = i; + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sk; + int copied = 0, num_sge; + struct sk_msg *msg; + + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!msg)) + return -EAGAIN; + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(msg); + return -EAGAIN; + } + + sk_msg_init(msg); + num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); + if (unlikely(num_sge < 0)) { + kfree(msg); + return num_sge; + } + + sk_mem_charge(sk, skb->len); + copied = skb->len; + msg->sg.start = 0; + msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge; + msg->skb = skb; + + sk_psock_queue_msg(psock, msg); + sk->sk_data_ready(sk); + return copied; +} + +static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool ingress) +{ + if (ingress) + return sk_psock_skb_ingress(psock, skb); + else + return skb_send_sock_locked(psock->sk, skb, off, len); +} + +static void sk_psock_backlog(struct work_struct *work) +{ + struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct sk_psock_work_state *state = &psock->work_state; + struct sk_buff *skb; + bool ingress; + u32 len, off; + int ret; + + /* Lock sock to avoid losing sk_socket during loop. */ + lock_sock(psock->sk); + if (state->skb) { + skb = state->skb; + len = state->len; + off = state->off; + state->skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->ingress_skb))) { + len = skb->len; + off = 0; +start: + ingress = tcp_skb_bpf_ingress(skb); + do { + ret = -EIO; + if (likely(psock->sk->sk_socket)) + ret = sk_psock_handle_skb(psock, skb, off, + len, ingress); + if (ret <= 0) { + if (ret == -EAGAIN) { + state->skb = skb; + state->len = len; + state->off = off; + goto end; + } + /* Hard errors break pipe and stop xmit. */ + sk_psock_report_error(psock, ret ? -ret : EPIPE); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + kfree_skb(skb); + goto end; + } + off += ret; + len -= ret; + } while (len); + + if (!ingress) + kfree_skb(skb); + } +end: + release_sock(psock->sk); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node) +{ + struct sk_psock *psock = kzalloc_node(sizeof(*psock), + GFP_ATOMIC | __GFP_NOWARN, + node); + if (!psock) + return NULL; + + psock->sk = sk; + psock->eval = __SK_NONE; + + INIT_LIST_HEAD(&psock->link); + spin_lock_init(&psock->link_lock); + + INIT_WORK(&psock->work, sk_psock_backlog); + INIT_LIST_HEAD(&psock->ingress_msg); + skb_queue_head_init(&psock->ingress_skb); + + sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); + refcount_set(&psock->refcnt, 1); + + rcu_assign_sk_user_data(sk, psock); + sock_hold(sk); + + return psock; +} +EXPORT_SYMBOL_GPL(sk_psock_init); + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) +{ + struct sk_psock_link *link; + + spin_lock_bh(&psock->link_lock); + link = list_first_entry_or_null(&psock->link, struct sk_psock_link, + list); + if (link) + list_del(&link->list); + spin_unlock_bh(&psock->link_lock); + return link; +} + +void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +{ + struct sk_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { + list_del(&msg->list); + sk_msg_free(psock->sk, msg); + kfree(msg); + } +} + +static void sk_psock_zap_ingress(struct sk_psock *psock) +{ + __skb_queue_purge(&psock->ingress_skb); + __sk_psock_purge_ingress_msg(psock); +} + +static void sk_psock_link_destroy(struct sk_psock *psock) +{ + struct sk_psock_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &psock->link, list) { + list_del(&link->list); + sk_psock_free_link(link); + } +} + +static void sk_psock_destroy_deferred(struct work_struct *gc) +{ + struct sk_psock *psock = container_of(gc, struct sk_psock, gc); + + /* No sk_callback_lock since already detached. */ + if (psock->parser.enabled) + strp_done(&psock->parser.strp); + + cancel_work_sync(&psock->work); + + psock_progs_drop(&psock->progs); + + sk_psock_link_destroy(psock); + sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); + + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sk); + kfree(psock); +} + +void sk_psock_destroy(struct rcu_head *rcu) +{ + struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); + + INIT_WORK(&psock->gc, sk_psock_destroy_deferred); + schedule_work(&psock->gc); +} +EXPORT_SYMBOL_GPL(sk_psock_destroy); + +void sk_psock_drop(struct sock *sk, struct sk_psock *psock) +{ + rcu_assign_sk_user_data(sk, NULL); + sk_psock_cork_free(psock); + sk_psock_restore_proto(sk, psock); + + write_lock_bh(&sk->sk_callback_lock); + if (psock->progs.skb_parser) + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + + call_rcu_sched(&psock->rcu, sk_psock_destroy); +} +EXPORT_SYMBOL_GPL(sk_psock_drop); + +static int sk_psock_map_verd(int verdict, bool redir) +{ + switch (verdict) { + case SK_PASS: + return redir ? __SK_REDIRECT : __SK_PASS; + case SK_DROP: + default: + break; + } + + return __SK_DROP; +} + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg) +{ + struct bpf_prog *prog; + int ret; + + preempt_disable(); + rcu_read_lock(); + prog = READ_ONCE(psock->progs.msg_parser); + if (unlikely(!prog)) { + ret = __SK_PASS; + goto out; + } + + sk_msg_compute_data_pointers(msg); + msg->sk = sk; + ret = BPF_PROG_RUN(prog, msg); + ret = sk_psock_map_verd(ret, msg->sk_redir); + psock->apply_bytes = msg->apply_bytes; + if (ret == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = msg->sk_redir; + if (!psock->sk_redir) { + ret = __SK_DROP; + goto out; + } + sock_hold(psock->sk_redir); + } +out: + rcu_read_unlock(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); + +static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, + struct sk_buff *skb) +{ + int ret; + + skb->sk = psock->sk; + bpf_compute_data_end_sk_skb(skb); + preempt_disable(); + ret = BPF_PROG_RUN(prog, skb); + preempt_enable(); + /* strparser clones the skb before handing it to a upper layer, + * meaning skb_orphan has been called. We NULL sk on the way out + * to ensure we don't trigger a BUG_ON() in skb/sk operations + * later and because we are not charging the memory of this skb + * to any socket yet. + */ + skb->sk = NULL; + return ret; +} + +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sk_psock *psock_other; + struct sock *sk_other; + bool ingress; + + switch (verdict) { + case __SK_REDIRECT: + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) + goto out_free; + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) + goto out_free; + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + break; + } + /* fall-through */ + case __SK_DROP: + /* fall-through */ + default: +out_free: + kfree_skb(skb); + } +} + +static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = __SK_DROP; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + skb_orphan(skb); + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_verdict_apply(psock, skb, ret); +} + +static int sk_psock_strp_read_done(struct strparser *strp, int err) +{ + return err; +} + +static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = skb->len; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_parser); + if (likely(prog)) + ret = sk_psock_bpf_run(psock, prog, skb); + rcu_read_unlock(); + return ret; +} + +/* Called with socket lock held. */ +static void sk_psock_data_ready(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } + rcu_read_unlock(); +} + +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + rcu_read_unlock(); + write_space(sk); +} + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} diff --git a/net/core/sock.c b/net/core/sock.c index fdf9fc7d3f98..6fcc4bc07d19 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2239,67 +2239,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, - int first_coalesce) -{ - int sg_curr = *sg_curr_index, use = 0, rc = 0; - unsigned int size = *sg_curr_size; - struct page_frag *pfrag; - struct scatterlist *sge; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - unsigned int orig_offset; - - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && - sge->offset + sge->length == orig_offset) { - sge->length += use; - } else { - sge = sg + sg_curr; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sg_curr++; - - if (sg_curr == MAX_SKB_FRAGS) - sg_curr = 0; - - if (sg_curr == sg_start) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } -out: - *sg_curr_size = size; - *sg_curr_index = sg_curr; - return rc; -} -EXPORT_SYMBOL(sk_alloc_sg); - static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) diff --git a/net/core/sock_map.c b/net/core/sock_map.c new file mode 100644 index 000000000000..3c0e44cb811a --- /dev/null +++ b/net/core/sock_map.c @@ -0,0 +1,1002 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/net.h> +#include <linux/workqueue.h> +#include <linux/skmsg.h> +#include <linux/list.h> +#include <linux/jhash.h> + +struct bpf_stab { + struct bpf_map map; + struct sock **sks; + struct sk_psock_progs progs; + raw_spinlock_t lock; +}; + +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + u64 cost; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + stab = kzalloc(sizeof(*stab), GFP_USER); + if (!stab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); + + /* Make sure page count doesn't overflow. */ + cost = (u64) stab->map.max_entries * sizeof(struct sock *); + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_stab; + } + + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(stab->map.pages); + if (err) + goto free_stab; + + stab->sks = bpf_map_area_alloc(stab->map.max_entries * + sizeof(struct sock *), + stab->map.numa_node); + if (stab->sks) + return &stab->map; + err = -ENOMEM; +free_stab: + kfree(stab); + return ERR_PTR(err); +} + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) +{ + u32 ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int ret; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + ret = sock_map_prog_update(map, prog, attr->attach_type); + fdput(f); + return ret; +} + +static void sock_map_sk_acquire(struct sock *sk) + __acquires(&sk->sk_lock.slock) +{ + lock_sock(sk); + preempt_disable(); + rcu_read_lock(); +} + +static void sock_map_sk_release(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + rcu_read_unlock(); + preempt_enable(); + release_sock(sk); +} + +static void sock_map_add_link(struct sk_psock *psock, + struct sk_psock_link *link, + struct bpf_map *map, void *link_raw) +{ + link->link_raw = link_raw; + link->map = map; + spin_lock_bh(&psock->link_lock); + list_add_tail(&link->list, &psock->link); + spin_unlock_bh(&psock->link_lock); +} + +static void sock_map_del_link(struct sock *sk, + struct sk_psock *psock, void *link_raw) +{ + struct sk_psock_link *link, *tmp; + bool strp_stop = false; + + spin_lock_bh(&psock->link_lock); + list_for_each_entry_safe(link, tmp, &psock->link, list) { + if (link->link_raw == link_raw) { + struct bpf_map *map = link->map; + struct bpf_stab *stab = container_of(map, struct bpf_stab, + map); + if (psock->parser.enabled && stab->progs.skb_parser) + strp_stop = true; + list_del(&link->list); + sk_psock_free_link(link); + } + } + spin_unlock_bh(&psock->link_lock); + if (strp_stop) { + write_lock_bh(&sk->sk_callback_lock); + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + } +} + +static void sock_map_unref(struct sock *sk, void *link_raw) +{ + struct sk_psock *psock = sk_psock(sk); + + if (likely(psock)) { + sock_map_del_link(sk, psock, link_raw); + sk_psock_put(sk, psock); + } +} + +static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, + struct sock *sk) +{ + struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + bool skb_progs, sk_psock_is_new = false; + struct sk_psock *psock; + int ret; + + skb_verdict = READ_ONCE(progs->skb_verdict); + skb_parser = READ_ONCE(progs->skb_parser); + skb_progs = skb_parser && skb_verdict; + if (skb_progs) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) + return PTR_ERR(skb_verdict); + skb_parser = bpf_prog_inc_not_zero(skb_parser); + if (IS_ERR(skb_parser)) { + bpf_prog_put(skb_verdict); + return PTR_ERR(skb_parser); + } + } + + msg_parser = READ_ONCE(progs->msg_parser); + if (msg_parser) { + msg_parser = bpf_prog_inc_not_zero(msg_parser); + if (IS_ERR(msg_parser)) { + ret = PTR_ERR(msg_parser); + goto out; + } + } + + psock = sk_psock_get(sk); + if (psock) { + if (!sk_has_psock(sk)) { + ret = -EBUSY; + goto out_progs; + } + if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || + (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + sk_psock_put(sk, psock); + ret = -EBUSY; + goto out_progs; + } + } else { + psock = sk_psock_init(sk, map->numa_node); + if (!psock) { + ret = -ENOMEM; + goto out_progs; + } + sk_psock_is_new = true; + } + + if (msg_parser) + psock_set_prog(&psock->progs.msg_parser, msg_parser); + if (sk_psock_is_new) { + ret = tcp_bpf_init(sk); + if (ret < 0) + goto out_drop; + } else { + tcp_bpf_reinit(sk); + } + + write_lock_bh(&sk->sk_callback_lock); + if (skb_progs && !psock->parser.enabled) { + ret = sk_psock_init_strp(sk, psock); + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + goto out_drop; + } + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + psock_set_prog(&psock->progs.skb_parser, skb_parser); + sk_psock_start_strp(sk, psock); + } + write_unlock_bh(&sk->sk_callback_lock); + return 0; +out_drop: + sk_psock_put(sk, psock); +out_progs: + if (msg_parser) + bpf_prog_put(msg_parser); +out: + if (skb_progs) { + bpf_prog_put(skb_verdict); + bpf_prog_put(skb_parser); + } + return ret; +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + synchronize_rcu(); + rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock **psk = &stab->sks[i]; + struct sock *sk; + + sk = xchg(psk, NULL); + if (sk) + sock_map_unref(sk, psk); + } + raw_spin_unlock_bh(&stab->lock); + rcu_read_unlock(); + + bpf_map_area_free(stab->sks); + kfree(stab); +} + +static void sock_map_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); +} + +static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(key >= map->max_entries)) + return NULL; + return READ_ONCE(stab->sks[key]); +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, + struct sock **psk) +{ + struct sock *sk; + + raw_spin_lock_bh(&stab->lock); + sk = *psk; + if (!sk_test || sk_test == sk) + *psk = NULL; + raw_spin_unlock_bh(&stab->lock); + if (unlikely(!sk)) + return -EINVAL; + sock_map_unref(sk, psk); + return 0; +} + +static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + __sock_map_delete(stab, sk, link_raw); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = *(u32 *)key; + struct sock **psk; + + if (unlikely(i >= map->max_entries)) + return -EINVAL; + + psk = &stab->sks[i]; + return __sock_map_delete(stab, NULL, psk); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? *(u32 *)key : U32_MAX; + u32 *key_next = next; + + if (i == stab->map.max_entries - 1) + return -ENOENT; + if (i >= stab->map.max_entries) + *key_next = 0; + else + *key_next = i + 1; + return 0; +} + +static int sock_map_update_common(struct bpf_map *map, u32 idx, + struct sock *sk, u64 flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct sk_psock_link *link; + struct sk_psock *psock; + struct sock *osk; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(idx >= map->max_entries)) + return -E2BIG; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &stab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + raw_spin_lock_bh(&stab->lock); + osk = stab->sks[idx]; + if (osk && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!osk && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + sock_map_add_link(psock, link, map, &stab->sks[idx]); + stab->sks[idx] = sk; + if (osk) + sock_map_unref(osk, &stab->sks[idx]); + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&stab->lock); + if (psock) + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} + +static bool sock_map_sk_is_suitable(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static int sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + u32 idx = *(u32 *)key; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_map_update_common(map, idx, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_map_update_common(map, *(u32 *)key, sops->sk, + flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_map_release_progs, + .map_check_btf = map_check_no_btf, +}; + +struct bpf_htab_elem { + struct rcu_head rcu; + u32 hash; + struct sock *sk; + struct hlist_node node; + u8 key[0]; +}; + +struct bpf_htab_bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bpf_htab_bucket *buckets; + u32 buckets_num; + u32 elem_size; + struct sk_psock_progs progs; + atomic_t count; +}; + +static inline u32 sock_hash_bucket_hash(const void *key, u32 len) +{ + return jhash(key, len, 0); +} + +static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab, + u32 hash) +{ + return &htab->buckets[hash & (htab->buckets_num - 1)]; +} + +static struct bpf_htab_elem * +sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, + u32 key_size) +{ + struct bpf_htab_elem *elem; + + hlist_for_each_entry_rcu(elem, head, node) { + if (elem->hash == hash && + !memcmp(&elem->key, key, key_size)) + return elem; + } + + return NULL; +} + +static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + + return elem ? elem->sk : NULL; +} + +static void sock_hash_free_elem(struct bpf_htab *htab, + struct bpf_htab_elem *elem) +{ + atomic_dec(&htab->count); + kfree_rcu(elem, rcu); +} + +static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem_probe, *elem = link_raw; + struct bpf_htab_bucket *bucket; + + WARN_ON_ONCE(!rcu_read_lock_held()); + bucket = sock_hash_select_bucket(htab, elem->hash); + + /* elem may be deleted in parallel from the map, but access here + * is okay since it's going away only after RCU grace period. + * However, we need to check whether it's still present. + */ + raw_spin_lock_bh(&bucket->lock); + elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, + elem->key, map->key_size); + if (elem_probe && elem_probe == elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 hash, key_size = map->key_size; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + int ret = -ENOENT; + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + ret = 0; + } + raw_spin_unlock_bh(&bucket->lock); + return ret; +} + +static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_htab_elem *old) +{ + struct bpf_htab_elem *new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + + new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!new) { + atomic_dec(&htab->count); + return ERR_PTR(-ENOMEM); + } + memcpy(new->key, key, key_size); + new->sk = sk; + new->hash = hash; + return new; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_elem *elem, *elem_new; + struct bpf_htab_bucket *bucket; + struct sk_psock_link *link; + struct sk_psock *psock; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &htab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!elem && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); + if (IS_ERR(elem_new)) { + ret = PTR_ERR(elem_new); + goto out_unlock; + } + + sock_map_add_link(psock, link, map, elem_new); + /* Add new element to the head of the list, so that + * concurrent search will find it before old elem. + */ + hlist_add_head_rcu(&elem_new->node, &bucket->head); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&bucket->lock); + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static int sock_hash_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_hash_update_common(map, key, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +static int sock_hash_get_next_key(struct bpf_map *map, void *key, + void *key_next) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem, *elem_next; + u32 hash, key_size = map->key_size; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first_elem; + hash = sock_hash_bucket_hash(key, key_size); + head = &sock_hash_select_bucket(htab, hash)->head; + elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); + if (!elem) + goto find_first_elem; + + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + + i = hash & (htab->buckets_num - 1); + i++; +find_first_elem: + for (; i < htab->buckets_num; i++) { + head = &sock_hash_select_bucket(htab, i)->head; + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + } + + return -ENOENT; +} + +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size == 0 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + if (attr->key_size > MAX_BPF_STACK) + return ERR_PTR(-E2BIG); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct bpf_htab_elem) + + round_up(htab->map.key_size, 8); + if (htab->buckets_num == 0 || + htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) { + err = -EINVAL; + goto free_htab; + } + + cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) + + (u64) htab->elem_size * htab->map.max_entries; + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_htab; + } + + htab->buckets = bpf_map_area_alloc(htab->buckets_num * + sizeof(struct bpf_htab_bucket), + htab->map.numa_node); + if (!htab->buckets) { + err = -ENOMEM; + goto free_htab; + } + + for (i = 0; i < htab->buckets_num; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + struct hlist_node *node; + int i; + + synchronize_rcu(); + rcu_read_lock(); + for (i = 0; i < htab->buckets_num; i++) { + bucket = sock_hash_select_bucket(htab, i); + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry_safe(elem, node, &bucket->head, node) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + } + raw_spin_unlock_bh(&bucket->lock); + } + rcu_read_unlock(); + + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static void sock_hash_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); +} + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_hash_update_common(map, key, sops->sk, flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_hash_release_progs, + .map_check_btf = map_check_no_btf, +}; + +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return &container_of(map, struct bpf_stab, map)->progs; + case BPF_MAP_TYPE_SOCKHASH: + return &container_of(map, struct bpf_htab, map)->progs; + default: + break; + } + + return NULL; +} + +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + u32 which) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + + if (!progs) + return -EOPNOTSUPP; + + switch (which) { + case BPF_SK_MSG_VERDICT: + psock_set_prog(&progs->msg_parser, prog); + break; + case BPF_SK_SKB_STREAM_PARSER: + psock_set_prog(&progs->skb_parser, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + psock_set_prog(&progs->skb_verdict, prog); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) +{ + switch (link->map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return sock_map_delete_from_link(link->map, sk, + link->link_raw); + case BPF_MAP_TYPE_SOCKHASH: + return sock_hash_delete_from_link(link->map, sk, + link->link_raw); + default: + break; + } +} diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 7446b98661d8..58629314eae9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o +obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c new file mode 100644 index 000000000000..80debb0daf37 --- /dev/null +++ b/net/ipv4/tcp_bpf.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/skmsg.h> +#include <linux/filter.h> +#include <linux/bpf.h> +#include <linux/init.h> +#include <linux/wait.h> + +#include <net/inet_common.h> + +static bool tcp_bpf_stream_read(const struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} + +static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + +int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + struct msghdr *msg, int len) +{ + struct iov_iter *iter = &msg->msg_iter; + int i, ret, copied = 0; + + while (copied != len) { + struct scatterlist *sge; + struct sk_msg *msg_rx; + + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + ret = copy_page_to_iter(page, sge->offset, copy, iter); + if (ret != copy) { + msg_rx->sg.start = i; + return -EFAULT; + } + + copied += copy; + sge->offset += copy; + sge->length -= copy; + sk_mem_uncharge(sk, copy); + if (!sge->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (!msg_rx->skb) + put_page(page); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + list_del(&msg_rx->list); + if (msg_rx->skb) + consume_skb(msg_rx->skb); + kfree(msg_rx); + } + } + + return copied; +} +EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); + +int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + lock_sock(sk); +msg_bytes_ready: + copied = __tcp_bpf_recvmsg(sk, psock, msg, len); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (skb_queue_empty(&sk->sk_receive_queue)) + goto msg_bytes_ready; + release_sock(sk); + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + } + if (err) { + ret = err; + goto out; + } + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} + +static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg, u32 apply_bytes, int flags) +{ + bool apply = apply_bytes; + struct scatterlist *sge; + u32 size, copied = 0; + struct sk_msg *tmp; + int i, ret = 0; + + tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!tmp)) + return -ENOMEM; + + lock_sock(sk); + tmp->sg.start = msg->sg.start; + i = msg->sg.start; + do { + sge = sk_msg_elem(msg, i); + size = (apply && apply_bytes < sge->length) ? + apply_bytes : sge->length; + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + ret = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + sk_msg_xfer(tmp, msg, i, size); + copied += size; + if (sge->length) + get_page(sk_msg_page(tmp, i)); + sk_msg_iter_var_next(i); + tmp->sg.end = i; + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != msg->sg.end); + + if (!ret) { + msg->sg.start = i; + msg->sg.size -= apply_bytes; + sk_psock_queue_msg(psock, tmp); + sk->sk_data_ready(sk); + } else { + sk_msg_free(sk, tmp); + kfree(tmp); + } + + release_sock(sk); + return ret; +} + +static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sge; + struct page *page; + int size, ret = 0; + u32 off; + + while (1) { + sge = sk_msg_elem(msg, msg->sg.start); + size = (apply && apply_bytes < sge->length) ? + apply_bytes : sge->length; + off = sge->offset; + page = sg_page(sge); + + tcp_rate_check_app_limited(sk); +retry: + ret = do_tcp_sendpages(sk, page, off, size, flags); + if (ret <= 0) + return ret; + if (apply) + apply_bytes -= ret; + msg->sg.size -= ret; + sge->offset += ret; + sge->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + if (ret != size) { + size -= ret; + off += ret; + goto retry; + } + if (!sge->length) { + put_page(page); + sk_msg_iter_next(msg, start); + sg_init_table(sge, 1); + if (msg->sg.start == msg->sg.end) + break; + } + if (apply && !apply_bytes) + break; + } + + return 0; +} + +static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, + u32 apply_bytes, int flags, bool uncharge) +{ + int ret; + + lock_sock(sk); + ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); + release_sock(sk); + return ret; +} + +int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, + u32 bytes, int flags) +{ + bool ingress = sk_msg_to_ingress(msg); + struct sk_psock *psock = sk_psock_get(sk); + int ret; + + if (unlikely(!psock)) { + sk_msg_free(sk, msg); + return 0; + } + ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : + tcp_bpf_push_locked(sk, msg, bytes, flags, false); + sk_psock_put(sk, psock); + return ret; +} +EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); + +static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg, int *copied, int flags) +{ + bool cork = false, enospc = msg->sg.start == msg->sg.end; + struct sock *sk_redir; + u32 tosend; + int ret; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = sk_psock_msg_verdict(sk, psock, msg); + + if (msg->cork_bytes && + msg->cork_bytes > msg->sg.size && !enospc) { + psock->cork_bytes = msg->cork_bytes - msg->sg.size; + if (!psock->cork) { + psock->cork = kzalloc(sizeof(*psock->cork), + GFP_ATOMIC | __GFP_NOWARN); + if (!psock->cork) + return -ENOMEM; + } + memcpy(psock->cork, msg, sizeof(*msg)); + return 0; + } + + tosend = msg->sg.size; + if (psock->apply_bytes && psock->apply_bytes < tosend) + tosend = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + ret = tcp_bpf_push(sk, msg, tosend, flags, true); + if (unlikely(ret)) { + *copied -= sk_msg_free(sk, msg); + break; + } + sk_msg_apply_bytes(psock, tosend); + break; + case __SK_REDIRECT: + sk_redir = psock->sk_redir; + sk_msg_apply_bytes(psock, tosend); + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + sk_msg_return(sk, msg, tosend); + release_sock(sk); + ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + lock_sock(sk); + if (unlikely(ret < 0)) { + int free = sk_msg_free_nocharge(sk, msg); + + if (!cork) + *copied -= free; + } + if (cork) { + sk_msg_free(sk, msg); + kfree(msg); + msg = NULL; + ret = 0; + } + break; + case __SK_DROP: + default: + sk_msg_free_partial(sk, msg, tosend); + sk_msg_apply_bytes(psock, tosend); + *copied -= tosend; + return -EACCES; + } + + if (likely(!ret)) { + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } + if (msg && + msg->sg.data[msg->sg.start].page_link && + msg->sg.data[msg->sg.start].length) + goto more_data; + } + return ret; +} + +static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct sk_msg tmp, *msg_tx = NULL; + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + int copied = 0, err = 0; + struct sk_psock *psock; + long timeo; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_sendmsg(sk, msg, size); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + while (msg_data_left(msg)) { + bool enospc = false; + u32 copy, osize; + + if (sk->sk_err) { + err = -sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + if (psock->cork) { + msg_tx = psock->cork; + } else { + msg_tx = &tmp; + sk_msg_init(msg_tx); + } + + osize = msg_tx->sg.size; + err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = msg_tx->sg.size - osize; + } + + err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + copy); + if (err < 0) { + sk_msg_trim(sk, msg_tx, osize); + goto out_err; + } + + copied += copy; + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + if (psock->cork_bytes && !enospc) + goto out_err; + /* All cork bytes are accounted, rerun the prog. */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) { + if (msg_tx && msg_tx != psock->cork) + sk_msg_free(sk, msg_tx); + goto out_err; + } + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); + release_sock(sk); + sk_psock_put(sk, psock); + return copied ? copied : err; +} + +static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct sk_msg tmp, *msg = NULL; + int err = 0, copied = 0; + struct sk_psock *psock; + bool enospc = false; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_sendpage(sk, page, offset, size, flags); + + lock_sock(sk); + if (psock->cork) { + msg = psock->cork; + } else { + msg = &tmp; + sk_msg_init(msg); + } + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(sk_msg_full(msg))) + goto out_err; + + sk_msg_page_add(msg, page, size, offset); + sk_mem_charge(sk, size); + copied = size; + if (sk_msg_full(msg)) + enospc = true; + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + if (psock->cork_bytes && !enospc) + goto out_err; + /* All cork bytes are accounted, rerun the prog. */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); +out_err: + release_sock(sk); + sk_psock_put(sk, psock); + return copied ? copied : err; +} + +static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + sk_psock_cork_free(psock); + __sk_psock_purge_ingress_msg(psock); + while ((link = sk_psock_link_pop(psock))) { + sk_psock_unlink(sk, link); + sk_psock_free_link(link); + } +} + +static void tcp_bpf_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + + saved_unhash = psock->saved_unhash; + tcp_bpf_remove(sk, psock); + rcu_read_unlock(); + saved_unhash(sk); +} + +static void tcp_bpf_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + + saved_close = psock->saved_close; + tcp_bpf_remove(sk, psock); + rcu_read_unlock(); + release_sock(sk); + saved_close(sk, timeout); +} + +enum { + TCP_BPF_IPV4, + TCP_BPF_IPV6, + TCP_BPF_NUM_PROTS, +}; + +enum { + TCP_BPF_BASE, + TCP_BPF_TX, + TCP_BPF_NUM_CFGS, +}; + +static struct proto *tcpv6_prot_saved __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; + +static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], + struct proto *base) +{ + prot[TCP_BPF_BASE] = *base; + prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash; + prot[TCP_BPF_BASE].close = tcp_bpf_close; + prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; + prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; + + prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; + prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; +} + +static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +{ + if (sk->sk_family == AF_INET6 && + unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { + spin_lock_bh(&tcpv6_prot_lock); + if (likely(ops != tcpv6_prot_saved)) { + tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); + smp_store_release(&tcpv6_prot_saved, ops); + } + spin_unlock_bh(&tcpv6_prot_lock); + } +} + +static int __init tcp_bpf_v4_build_proto(void) +{ + tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); + return 0; +} +core_initcall(tcp_bpf_v4_build_proto); + +static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; + int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + + sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]); +} + +static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; + int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + + /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed + * or added requiring sk_prot hook updates. We keep original saved + * hooks in this case. + */ + sk->sk_prot = &tcp_bpf_prots[family][config]; +} + +static int tcp_bpf_assert_proto_ops(struct proto *ops) +{ + /* In order to avoid retpoline, we make assumptions when we call + * into ops if e.g. a psock is not present. Make sure they are + * indeed valid assumptions. + */ + return ops->recvmsg == tcp_recvmsg && + ops->sendmsg == tcp_sendmsg && + ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; +} + +void tcp_bpf_reinit(struct sock *sk) +{ + struct sk_psock *psock; + + sock_owned_by_me(sk); + + rcu_read_lock(); + psock = sk_psock(sk); + tcp_bpf_reinit_sk_prot(sk, psock); + rcu_read_unlock(); +} + +int tcp_bpf_init(struct sock *sk) +{ + struct proto *ops = READ_ONCE(sk->sk_prot); + struct sk_psock *psock; + + sock_owned_by_me(sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock || psock->sk_proto || + tcp_bpf_assert_proto_ops(ops))) { + rcu_read_unlock(); + return -EINVAL; + } + tcp_bpf_check_v6_needs_rebuild(sk, ops); + tcp_bpf_update_sk_prot(sk, psock); + rcu_read_unlock(); + return 0; +} diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index a5995bb2eaca..a9162aa11af9 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -6,7 +6,7 @@ * */ -#include<linux/module.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> @@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) return NULL; } -static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp) -{ - struct tcp_ulp_ops *e; - - list_for_each_entry_rcu(e, &tcp_ulp_list, list) { - if (e->uid == ulp) - return e; - } - - return NULL; -} - static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; @@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) return ulp; } -static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid) -{ - const struct tcp_ulp_ops *ulp; - - rcu_read_lock(); - ulp = tcp_ulp_find_id(uid); - if (!ulp || !try_module_get(ulp->owner)) - ulp = NULL; - rcu_read_unlock(); - return ulp; -} - /* Attach new upper layer protocol to the list * of available protocols. */ @@ -123,6 +99,8 @@ void tcp_cleanup_ulp(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + sock_owned_by_me(sk); + if (!icsk->icsk_ulp_ops) return; @@ -133,54 +111,35 @@ void tcp_cleanup_ulp(struct sock *sk) icsk->icsk_ulp_ops = NULL; } -/* Change upper layer protocol for socket */ -int tcp_set_ulp(struct sock *sk, const char *name) +static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_ulp_ops *ulp_ops; - int err = 0; + int err; + err = -EEXIST; if (icsk->icsk_ulp_ops) - return -EEXIST; - - ulp_ops = __tcp_ulp_find_autoload(name); - if (!ulp_ops) - return -ENOENT; - - if (!ulp_ops->user_visible) { - module_put(ulp_ops->owner); - return -ENOENT; - } + goto out_err; err = ulp_ops->init(sk); - if (err) { - module_put(ulp_ops->owner); - return err; - } + if (err) + goto out_err; icsk->icsk_ulp_ops = ulp_ops; return 0; +out_err: + module_put(ulp_ops->owner); + return err; } -int tcp_set_ulp_id(struct sock *sk, int ulp) +int tcp_set_ulp(struct sock *sk, const char *name) { - struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_ulp_ops *ulp_ops; - int err; - if (icsk->icsk_ulp_ops) - return -EEXIST; + sock_owned_by_me(sk); - ulp_ops = __tcp_ulp_lookup(ulp); + ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) return -ENOENT; - err = ulp_ops->init(sk); - if (err) { - module_put(ulp_ops->owner); - return err; - } - - icsk->icsk_ulp_ops = ulp_ops; - return 0; + return __tcp_set_ulp(sk, ulp_ops); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e9c8cfdf4b4c..3f4d61017a69 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -901,6 +901,7 @@ static const struct ipv6_stub ipv6_stub_impl = { static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .inet6_bind = __inet6_bind, + .udp6_lib_lookup = __udp6_lib_lookup, }; static int __init inet6_init(void) diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig index 6cff3f6d0c3a..94da19a2a220 100644 --- a/net/strparser/Kconfig +++ b/net/strparser/Kconfig @@ -1,4 +1,2 @@ - config STREAM_PARSER - tristate - default n + def_bool n diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 73f05ece53d0..99c1a19c17b1 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -8,6 +8,7 @@ config TLS select CRYPTO_AES select CRYPTO_GCM select STREAM_PARSER + select NET_SOCK_MSG default n ---help--- Enable kernel support for TLS protocol. This allows symmetric diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 961b07d4d41c..276edbc04f38 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -421,7 +421,7 @@ last_record: tls_push_record_flags = flags; if (more) { tls_ctx->pending_open_record_frags = - record->num_frags; + !!record->num_frags; break; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b428069a1b05..e90b6d537077 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -620,12 +620,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; - prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; + prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; - prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; + prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; #ifdef CONFIG_TLS_DEVICE prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; @@ -724,7 +726,6 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index aa9fdce272b6..a525fc4c2a4b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -4,6 +4,7 @@ * Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved. * Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved. * Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved. + * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -213,153 +214,89 @@ static int tls_do_decryption(struct sock *sk, return ret; } -static void trim_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, int target_size) -{ - int i = *sg_num_elem - 1; - int trim = *sg_size - target_size; - - if (trim <= 0) { - WARN_ON(trim < 0); - return; - } - - *sg_size = target_size; - while (trim >= sg[i].length) { - trim -= sg[i].length; - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - i--; - - if (i < 0) - goto out; - } - - sg[i].length -= trim; - sk_mem_uncharge(sk, trim); - -out: - *sg_num_elem = i + 1; -} - -static void trim_both_sgl(struct sock *sk, int target_size) +static void tls_trim_both_msgs(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - trim_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - target_size); - + sk_msg_trim(sk, &rec->msg_plaintext, target_size); if (target_size > 0) target_size += tls_ctx->tx.overhead_size; - - trim_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, - target_size); + sk_msg_trim(sk, &rec->msg_encrypted, target_size); } -static int alloc_encrypted_sg(struct sock *sk, int len) +static int tls_alloc_encrypted_msg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - int rc = 0; - - rc = sk_alloc_sg(sk, len, - &rec->sg_encrypted_data[1], 0, - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, 0); - - if (rc == -ENOSPC) - rec->sg_encrypted_num_elem = - ARRAY_SIZE(rec->sg_encrypted_data) - 1; + struct sk_msg *msg_en = &rec->msg_encrypted; - return rc; + return sk_msg_alloc(sk, msg_en, len, 0); } -static int move_to_plaintext_sg(struct sock *sk, int required_size) +static int tls_clone_plaintext_msg(struct sock *sk, int required) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - struct scatterlist *plain_sg = &rec->sg_plaintext_data[1]; - struct scatterlist *enc_sg = &rec->sg_encrypted_data[1]; - int enc_sg_idx = 0; + struct sk_msg *msg_pl = &rec->msg_plaintext; + struct sk_msg *msg_en = &rec->msg_encrypted; int skip, len; - if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) - return -ENOSPC; - - /* We add page references worth len bytes from enc_sg at the - * end of plain_sg. It is guaranteed that sg_encrypted_data + /* We add page references worth len bytes from encrypted sg + * at the end of plaintext sg. It is guaranteed that msg_en * has enough required room (ensured by caller). */ - len = required_size - rec->sg_plaintext_size; + len = required - msg_pl->sg.size; - /* Skip initial bytes in sg_encrypted_data to be able - * to use same offset of both plain and encrypted data. + /* Skip initial bytes in msg_en's data to be able to use + * same offset of both plain and encrypted data. */ - skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size; + skip = tls_ctx->tx.prepend_size + msg_pl->sg.size; - while (enc_sg_idx < rec->sg_encrypted_num_elem) { - if (enc_sg[enc_sg_idx].length > skip) - break; - - skip -= enc_sg[enc_sg_idx].length; - enc_sg_idx++; - } - - /* unmark the end of plain_sg*/ - sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1); - - while (len) { - struct page *page = sg_page(&enc_sg[enc_sg_idx]); - int bytes = enc_sg[enc_sg_idx].length - skip; - int offset = enc_sg[enc_sg_idx].offset + skip; - - if (bytes > len) - bytes = len; - else - enc_sg_idx++; + return sk_msg_clone(sk, msg_pl, msg_en, skip, len); +} - /* Skipping is required only one time */ - skip = 0; +static struct tls_rec *tls_get_rec(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct sk_msg *msg_pl, *msg_en; + struct tls_rec *rec; + int mem_size; - /* Increment page reference */ - get_page(page); + mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); - sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page, - bytes, offset); + rec = kzalloc(mem_size, sk->sk_allocation); + if (!rec) + return NULL; - sk_mem_charge(sk, bytes); + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; - len -= bytes; - rec->sg_plaintext_size += bytes; + sk_msg_init(msg_pl); + sk_msg_init(msg_en); - rec->sg_plaintext_num_elem++; + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_in[1]); - if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) - return -ENOSPC; - } + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); - return 0; + return rec; } -static void free_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size) +static void tls_free_rec(struct sock *sk, struct tls_rec *rec) { - int i, n = *sg_num_elem; - - for (i = 0; i < n; ++i) { - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - } - *sg_num_elem = 0; - *sg_size = 0; + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); + kfree(rec); } static void tls_free_open_rec(struct sock *sk) @@ -368,19 +305,10 @@ static void tls_free_open_rec(struct sock *sk) struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - /* Return if there is no open record */ - if (!rec) - return; - - free_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - - kfree(rec); + if (rec) { + tls_free_rec(sk, rec); + ctx->open_rec = NULL; + } } int tls_tx_records(struct sock *sk, int flags) @@ -388,6 +316,7 @@ int tls_tx_records(struct sock *sk, int flags) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec, *tmp; + struct sk_msg *msg_en; int tx_flags, rc = 0; if (tls_is_partially_sent_record(tls_ctx)) { @@ -407,9 +336,7 @@ int tls_tx_records(struct sock *sk, int flags) * Remove the head of tx_list */ list_del(&rec->list); - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } @@ -421,17 +348,15 @@ int tls_tx_records(struct sock *sk, int flags) else tx_flags = flags; + msg_en = &rec->msg_encrypted; rc = tls_push_sg(sk, tls_ctx, - &rec->sg_encrypted_data[1], + &msg_en->sg.data[msg_en->sg.curr], 0, tx_flags); if (rc) goto tx_err; list_del(&rec->list); - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } else { break; @@ -451,15 +376,18 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) struct sock *sk = req->data; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct scatterlist *sge; + struct sk_msg *msg_en; struct tls_rec *rec; bool ready = false; int pending; rec = container_of(aead_req, struct tls_rec, aead_req); + msg_en = &rec->msg_encrypted; - rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; - + sge = sk_msg_elem(msg_en, msg_en->sg.curr); + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += tls_ctx->tx.prepend_size; /* Check if error is previously set on socket */ if (err || sk->sk_err) { @@ -497,31 +425,29 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) /* Schedule the transmission */ if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) - schedule_delayed_work(&ctx->tx_work.work, 2); + schedule_delayed_work(&ctx->tx_work.work, 1); } static int tls_do_encryption(struct sock *sk, struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, struct aead_request *aead_req, - size_t data_len) + size_t data_len, u32 start) { struct tls_rec *rec = ctx->open_rec; - struct scatterlist *plain_sg = rec->sg_plaintext_data; - struct scatterlist *enc_sg = rec->sg_encrypted_data; + struct sk_msg *msg_en = &rec->msg_encrypted; + struct scatterlist *sge = sk_msg_elem(msg_en, start); int rc; - /* Skip the first index as it contains AAD data */ - rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size; + sge->offset += tls_ctx->tx.prepend_size; + sge->length -= tls_ctx->tx.prepend_size; - /* If it is inplace crypto, then pass same SG list as both src, dst */ - if (rec->inplace_crypto) - plain_sg = enc_sg; + msg_en->sg.curr = start; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, plain_sg, enc_sg, + aead_request_set_crypt(aead_req, rec->sg_aead_in, + rec->sg_aead_out, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, @@ -534,8 +460,8 @@ static int tls_do_encryption(struct sock *sk, rc = crypto_aead_encrypt(aead_req); if (!rc || rc != -EINPROGRESS) { atomic_dec(&ctx->encrypt_pending); - rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += tls_ctx->tx.prepend_size; } if (!rc) { @@ -551,177 +477,318 @@ static int tls_do_encryption(struct sock *sk, return rc; } +static int tls_split_open_record(struct sock *sk, struct tls_rec *from, + struct tls_rec **to, struct sk_msg *msg_opl, + struct sk_msg *msg_oen, u32 split_point, + u32 tx_overhead_size, u32 *orig_end) +{ + u32 i, j, bytes = 0, apply = msg_opl->apply_bytes; + struct scatterlist *sge, *osge, *nsge; + u32 orig_size = msg_opl->sg.size; + struct scatterlist tmp = { }; + struct sk_msg *msg_npl; + struct tls_rec *new; + int ret; + + new = tls_get_rec(sk); + if (!new) + return -ENOMEM; + ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size + + tx_overhead_size, 0); + if (ret < 0) { + tls_free_rec(sk, new); + return ret; + } + + *orig_end = msg_opl->sg.end; + i = msg_opl->sg.start; + sge = sk_msg_elem(msg_opl, i); + while (apply && sge->length) { + if (sge->length > apply) { + u32 len = sge->length - apply; + + get_page(sg_page(sge)); + sg_set_page(&tmp, sg_page(sge), len, + sge->offset + apply); + sge->length = apply; + bytes += apply; + apply = 0; + } else { + apply -= sge->length; + bytes += sge->length; + } + + sk_msg_iter_var_next(i); + if (i == msg_opl->sg.end) + break; + sge = sk_msg_elem(msg_opl, i); + } + + msg_opl->sg.end = i; + msg_opl->sg.curr = i; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = 0; + msg_opl->sg.size = bytes; + + msg_npl = &new->msg_plaintext; + msg_npl->apply_bytes = apply; + msg_npl->sg.size = orig_size - bytes; + + j = msg_npl->sg.start; + nsge = sk_msg_elem(msg_npl, j); + if (tmp.length) { + memcpy(nsge, &tmp, sizeof(*nsge)); + sk_msg_iter_var_next(j); + nsge = sk_msg_elem(msg_npl, j); + } + + osge = sk_msg_elem(msg_opl, i); + while (osge->length) { + memcpy(nsge, osge, sizeof(*nsge)); + sg_unmark_end(nsge); + sk_msg_iter_var_next(i); + sk_msg_iter_var_next(j); + if (i == *orig_end) + break; + osge = sk_msg_elem(msg_opl, i); + nsge = sk_msg_elem(msg_npl, j); + } + + msg_npl->sg.end = j; + msg_npl->sg.curr = j; + msg_npl->sg.copybreak = 0; + + *to = new; + return 0; +} + +static void tls_merge_open_record(struct sock *sk, struct tls_rec *to, + struct tls_rec *from, u32 orig_end) +{ + struct sk_msg *msg_npl = &from->msg_plaintext; + struct sk_msg *msg_opl = &to->msg_plaintext; + struct scatterlist *osge, *nsge; + u32 i, j; + + i = msg_opl->sg.end; + sk_msg_iter_var_prev(i); + j = msg_npl->sg.start; + + osge = sk_msg_elem(msg_opl, i); + nsge = sk_msg_elem(msg_npl, j); + + if (sg_page(osge) == sg_page(nsge) && + osge->offset + osge->length == nsge->offset) { + osge->length += nsge->length; + put_page(sg_page(nsge)); + } + + msg_opl->sg.end = orig_end; + msg_opl->sg.curr = orig_end; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size; + msg_opl->sg.size += msg_npl->sg.size; + + sk_msg_free(sk, &to->msg_encrypted); + sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted); + + kfree(from); +} + static int tls_push_record(struct sock *sk, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec = ctx->open_rec; + struct tls_rec *rec = ctx->open_rec, *tmp = NULL; + u32 i, split_point, uninitialized_var(orig_end); + struct sk_msg *msg_pl, *msg_en; struct aead_request *req; + bool split; int rc; if (!rec) return 0; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + split_point = msg_pl->apply_bytes; + split = split_point && split_point < msg_pl->sg.size; + if (split) { + rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en, + split_point, tls_ctx->tx.overhead_size, + &orig_end); + if (rc < 0) + return rc; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + } + rec->tx_flags = flags; req = &rec->aead_req; - sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem); - sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem); + i = msg_pl->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_pl, i)); + + i = msg_pl->sg.start; + sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? + &msg_en->sg.data[i] : &msg_pl->sg.data[i]); + + i = msg_en->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_en, i)); - tls_make_aad(rec->aad_space, rec->sg_plaintext_size, + i = msg_en->sg.start; + sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); + + tls_make_aad(rec->aad_space, msg_pl->sg.size, tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, - page_address(sg_page(&rec->sg_encrypted_data[1])) + - rec->sg_encrypted_data[1].offset, - rec->sg_plaintext_size, record_type); - - tls_ctx->pending_open_record_frags = 0; + page_address(sg_page(&msg_en->sg.data[i])) + + msg_en->sg.data[i].offset, msg_pl->sg.size, + record_type); - rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size); - if (rc == -EINPROGRESS) - return -EINPROGRESS; + tls_ctx->pending_open_record_frags = false; + rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i); if (rc < 0) { - tls_err_abort(sk, EBADMSG); + if (rc != -EINPROGRESS) { + tls_err_abort(sk, EBADMSG); + if (split) { + tls_ctx->pending_open_record_frags = true; + tls_merge_open_record(sk, rec, tmp, orig_end); + } + } return rc; + } else if (split) { + msg_pl = &tmp->msg_plaintext; + msg_en = &tmp->msg_encrypted; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + tls_ctx->pending_open_record_frags = true; + ctx->open_rec = tmp; } return tls_tx_records(sk, flags); } -static int tls_sw_push_pending_record(struct sock *sk, int flags) -{ - return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); -} - -static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - int length, int *pages_used, - unsigned int *size_used, - struct scatterlist *to, int to_max_pages, - bool charge) +static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, + bool full_record, u8 record_type, + size_t *copied, int flags) { - struct page *pages[MAX_SKB_FRAGS]; - - size_t offset; - ssize_t copied, use; - int i = 0; - unsigned int size = *size_used; - int num_elem = *pages_used; - int rc = 0; - int maxpages; - - while (length > 0) { - i = 0; - maxpages = to_max_pages - num_elem; - if (maxpages == 0) { - rc = -EFAULT; - goto out; - } - copied = iov_iter_get_pages(from, pages, - length, - maxpages, &offset); - if (copied <= 0) { - rc = -EFAULT; - goto out; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct sk_msg msg_redir = { }; + struct sk_psock *psock; + struct sock *sk_redir; + struct tls_rec *rec; + int err = 0, send; + bool enospc; + + psock = sk_psock_get(sk); + if (!psock) + return tls_push_record(sk, flags, record_type); +more_data: + enospc = sk_msg_full(msg); + if (psock->eval == __SK_NONE) + psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && + !enospc && !full_record) { + err = -ENOSPC; + goto out_err; + } + msg->cork_bytes = 0; + send = msg->sg.size; + if (msg->apply_bytes && msg->apply_bytes < send) + send = msg->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = tls_push_record(sk, flags, record_type); + if (err < 0) { + *copied -= sk_msg_free(sk, msg); + tls_free_open_rec(sk); + goto out_err; } - - iov_iter_advance(from, copied); - - length -= copied; - size += copied; - while (copied) { - use = min_t(int, copied, PAGE_SIZE - offset); - - sg_set_page(&to[num_elem], - pages[i], use, offset); - sg_unmark_end(&to[num_elem]); - if (charge) - sk_mem_charge(sk, use); - - offset = 0; - copied -= use; - - ++i; - ++num_elem; + break; + case __SK_REDIRECT: + sk_redir = psock->sk_redir; + memcpy(&msg_redir, msg, sizeof(*msg)); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + sk_msg_return_zero(sk, msg, send); + msg->sg.size -= send; + release_sock(sk); + err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + lock_sock(sk); + if (err < 0) { + *copied -= sk_msg_free_nocharge(sk, &msg_redir); + msg->sg.size = 0; } + if (msg->sg.size == 0) + tls_free_open_rec(sk); + break; + case __SK_DROP: + default: + sk_msg_free_partial(sk, msg, send); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + if (msg->sg.size == 0) + tls_free_open_rec(sk); + *copied -= send; + err = -EACCES; } - /* Mark the end in the last sg entry if newly added */ - if (num_elem > *pages_used) - sg_mark_end(&to[num_elem - 1]); -out: - if (rc) - iov_iter_revert(from, size - *size_used); - *size_used = size; - *pages_used = num_elem; + if (likely(!err)) { + bool reset_eval = !ctx->open_rec; - return rc; -} - -static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, - int bytes) -{ - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec = ctx->open_rec; - struct scatterlist *sg = &rec->sg_plaintext_data[1]; - int copy, i, rc = 0; - - for (i = tls_ctx->pending_open_record_frags; - i < rec->sg_plaintext_num_elem; ++i) { - copy = sg[i].length; - if (copy_from_iter( - page_address(sg_page(&sg[i])) + sg[i].offset, - copy, from) != copy) { - rc = -EFAULT; - goto out; + rec = ctx->open_rec; + if (rec) { + msg = &rec->msg_plaintext; + if (!msg->apply_bytes) + reset_eval = true; } - bytes -= copy; - - ++tls_ctx->pending_open_record_frags; - - if (!bytes) - break; + if (reset_eval) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } + if (rec) + goto more_data; } - -out: - return rc; + out_err: + sk_psock_put(sk, psock); + return err; } -static struct tls_rec *get_rec(struct sock *sk) +static int tls_sw_push_pending_record(struct sock *sk, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec; - int mem_size; - - /* Return if we already have an open record */ - if (ctx->open_rec) - return ctx->open_rec; - - mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); + struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl; + size_t copied; - rec = kzalloc(mem_size, sk->sk_allocation); if (!rec) - return NULL; - - sg_init_table(&rec->sg_plaintext_data[0], - ARRAY_SIZE(rec->sg_plaintext_data)); - sg_init_table(&rec->sg_encrypted_data[0], - ARRAY_SIZE(rec->sg_encrypted_data)); - - sg_set_buf(&rec->sg_plaintext_data[0], rec->aad_space, - sizeof(rec->aad_space)); - sg_set_buf(&rec->sg_encrypted_data[0], rec->aad_space, - sizeof(rec->aad_space)); + return 0; - ctx->open_rec = rec; - rec->inplace_crypto = 1; + msg_pl = &rec->msg_plaintext; + copied = msg_pl->sg.size; + if (!copied) + return 0; - return rec; + return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA, + &copied, flags); } int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) @@ -735,6 +802,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool is_kvec = msg->msg_iter.type & ITER_KVEC; bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; + struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int required_size; int num_async = 0; @@ -772,29 +840,35 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - rec = get_rec(sk); + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto send_end; } - orig_size = rec->sg_plaintext_size; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } - required_size = rec->sg_plaintext_size + try_to_copy + + required_size = msg_pl->sg.size + try_to_copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_encrypted: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -803,17 +877,15 @@ alloc_encrypted: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - rec->sg_encrypted_size; + try_to_copy -= required_size - msg_en->sg.size; full_record = true; } if (!is_kvec && (full_record || eor) && !async_capable) { - ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy, &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - &rec->sg_plaintext_data[1], - ARRAY_SIZE(rec->sg_plaintext_data) - 1, - true); + u32 first = msg_pl->sg.end; + + ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, + msg_pl, try_to_copy); if (ret) goto fallback_to_reg_send; @@ -821,25 +893,34 @@ alloc_encrypted: num_zc++; copied += try_to_copy; - ret = tls_push_record(sk, msg->msg_flags, record_type); + + sk_msg_sg_copy_set(msg_pl, first); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret == -ENOSPC) + goto rollback_iter; else if (ret != -EAGAIN) goto send_end; } continue; - +rollback_iter: + copied -= try_to_copy; + sk_msg_sg_copy_clear(msg_pl, first); + iov_iter_revert(&msg->msg_iter, + msg_pl->sg.size - orig_size); fallback_to_reg_send: - trim_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - orig_size); + sk_msg_trim(sk, msg_pl, orig_size); } - required_size = rec->sg_plaintext_size + try_to_copy; + required_size = msg_pl->sg.size + try_to_copy; - ret = move_to_plaintext_sg(sk, required_size); + ret = tls_clone_plaintext_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto send_end; @@ -848,28 +929,36 @@ fallback_to_reg_send: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - rec->sg_plaintext_size; + try_to_copy -= required_size - msg_pl->sg.size; full_record = true; - - trim_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, - rec->sg_plaintext_size + - tls_ctx->tx.overhead_size); + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); } - ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); - if (ret) + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl, + try_to_copy); + if (ret < 0) goto trim_sgl; + /* Open records defined only if successfully copied, otherwise + * we would trim the sg but not reset the open record frags. + */ + tls_ctx->pending_open_record_frags = true; copied += try_to_copy; if (full_record || eor) { - ret = tls_push_record(sk, msg->msg_flags, record_type); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; - else if (ret != -EAGAIN) + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; goto send_end; + } } } @@ -881,11 +970,11 @@ wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { trim_sgl: - trim_both_sgl(sk, orig_size); + tls_trim_both_msgs(sk, orig_size); goto send_end; } - if (rec->sg_encrypted_size < required_size) + if (msg_en->sg.size < required_size) goto alloc_encrypted; } @@ -928,10 +1017,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); unsigned char record_type = TLS_RECORD_TYPE_DATA; - size_t orig_size = size; - struct scatterlist *sg; + struct sk_msg *msg_pl; struct tls_rec *rec; int num_async = 0; + size_t copied = 0; bool full_record; int record_room; int ret = 0; @@ -964,26 +1053,33 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } - rec = get_rec(sk); + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto sendpage_end; } + msg_pl = &rec->msg_plaintext; + full_record = false; - record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + copied = 0; copy = size; if (copy >= record_room) { copy = record_room; full_record = true; } - required_size = rec->sg_plaintext_size + copy + - tls_ctx->tx.overhead_size; + + required_size = msg_pl->sg.size + copy + + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_payload: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -992,33 +1088,32 @@ alloc_payload: * actually allocated. The difference is due * to max sg elements limit */ - copy -= required_size - rec->sg_plaintext_size; + copy -= required_size - msg_pl->sg.size; full_record = true; } - get_page(page); - sg = &rec->sg_plaintext_data[1] + rec->sg_plaintext_num_elem; - sg_set_page(sg, page, copy, offset); - sg_unmark_end(sg); - - rec->sg_plaintext_num_elem++; - + sk_msg_page_add(msg_pl, page, copy, offset); sk_mem_charge(sk, copy); + offset += copy; size -= copy; - rec->sg_plaintext_size += copy; - tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem; + copied += copy; - if (full_record || eor || - rec->sg_plaintext_num_elem == - ARRAY_SIZE(rec->sg_plaintext_data) - 1) { + tls_ctx->pending_open_record_frags = true; + if (full_record || eor || sk_msg_full(msg_pl)) { rec->inplace_crypto = 0; - ret = tls_push_record(sk, flags, record_type); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, flags); if (ret) { if (ret == -EINPROGRESS) num_async++; - else if (ret != -EAGAIN) + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; goto sendpage_end; + } } } continue; @@ -1027,7 +1122,7 @@ wait_for_sndbuf: wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { - trim_both_sgl(sk, rec->sg_plaintext_size); + tls_trim_both_msgs(sk, msg_pl->sg.size); goto sendpage_end; } @@ -1042,24 +1137,20 @@ wait_for_memory: } } sendpage_end: - if (orig_size > size) - ret = orig_size - size; - else - ret = sk_stream_error(sk, flags, ret); - + ret = sk_stream_error(sk, flags, ret); release_sock(sk); - return ret; + return copied ? copied : ret; } -static struct sk_buff *tls_wait_data(struct sock *sk, int flags, - long timeo, int *err) +static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); - while (!(skb = ctx->recv_pkt)) { + while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) { if (sk->sk_err) { *err = sock_error(sk); return NULL; @@ -1078,7 +1169,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_wait_event(sk, &timeo, + ctx->recv_pkt != skb || + !sk_psock_queue_empty(psock), + &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -1092,6 +1186,64 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } +static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from, + int length, int *pages_used, + unsigned int *size_used, + struct scatterlist *to, + int to_max_pages) +{ + int rc = 0, i = 0, num_elem = *pages_used, maxpages; + struct page *pages[MAX_SKB_FRAGS]; + unsigned int size = *size_used; + ssize_t copied, use; + size_t offset; + + while (length > 0) { + i = 0; + maxpages = to_max_pages - num_elem; + if (maxpages == 0) { + rc = -EFAULT; + goto out; + } + copied = iov_iter_get_pages(from, pages, + length, + maxpages, &offset); + if (copied <= 0) { + rc = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + + length -= copied; + size += copied; + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + + sg_set_page(&to[num_elem], + pages[i], use, offset); + sg_unmark_end(&to[num_elem]); + /* We do not uncharge memory from this API */ + + offset = 0; + copied -= use; + + i++; + num_elem++; + } + } + /* Mark the end in the last sg entry if newly added */ + if (num_elem > *pages_used) + sg_mark_end(&to[num_elem - 1]); +out: + if (rc) + iov_iter_revert(from, size - *size_used); + *size_used = size; + *pages_used = num_elem; + + return rc; +} + /* This function decrypts the input skb into either out_iov or in out_sg * or in skb buffers itself. The input parameter 'zc' indicates if * zero-copy mode needs to be tried or not. With zero-copy mode, either @@ -1189,9 +1341,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); *chunk = 0; - err = zerocopy_from_iter(sk, out_iov, data_len, &pages, - chunk, &sgout[1], - (n_sgout - 1), false); + err = tls_setup_from_iter(sk, out_iov, data_len, + &pages, chunk, &sgout[1], + (n_sgout - 1)); if (err < 0) goto fallback_to_reg_recv; } else if (out_sg) { @@ -1297,6 +1449,7 @@ int tls_sw_recvmsg(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; unsigned char control; struct strp_msg *rxm; struct sk_buff *skb; @@ -1312,6 +1465,7 @@ int tls_sw_recvmsg(struct sock *sk, if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + psock = sk_psock_get(sk); lock_sock(sk); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -1321,9 +1475,19 @@ int tls_sw_recvmsg(struct sock *sk, bool async = false; int chunk = 0; - skb = tls_wait_data(sk, flags, timeo, &err); - if (!skb) + skb = tls_wait_data(sk, psock, flags, timeo, &err); + if (!skb) { + if (psock) { + int ret = __tcp_bpf_recvmsg(sk, psock, msg, len); + + if (ret > 0) { + copied += ret; + len -= ret; + continue; + } + } goto recv_end; + } rxm = strp_msg(skb); @@ -1429,6 +1593,8 @@ recv_end: } release_sock(sk); + if (psock) + sk_psock_put(sk, psock); return copied ? : err; } @@ -1451,7 +1617,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - skb = tls_wait_data(sk, flags, timeo, &err); + skb = tls_wait_data(sk, NULL, flags, timeo, &err); if (!skb) goto splice_read_end; @@ -1485,23 +1651,20 @@ splice_read_end: return copied ? : err; } -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) +bool tls_sw_stream_read(const struct sock *sk) { - unsigned int ret; - struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + bool ingress_empty = true; + struct sk_psock *psock; - /* Grab POLLOUT and POLLHUP from the underlying socket */ - ret = ctx->sk_poll(file, sock, wait); - - /* Clear POLLIN bits, and set based on recv_pkt */ - ret &= ~(POLLIN | POLLRDNORM); - if (ctx->recv_pkt) - ret |= POLLIN | POLLRDNORM; + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + ingress_empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); - return ret; + return !ingress_empty || ctx->recv_pkt; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1580,8 +1743,15 @@ static void tls_data_ready(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; strp_data_ready(&ctx->strp); + + psock = sk_psock_get(sk); + if (psock && !list_empty(&psock->ingress_msg)) { + ctx->saved_data_ready(sk); + sk_psock_put(sk, psock); + } } void tls_sw_free_resources_tx(struct sock *sk) @@ -1619,25 +1789,15 @@ void tls_sw_free_resources_tx(struct sock *sk) rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - list_del(&rec->list); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { - free_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - list_del(&rec->list); + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } @@ -1829,8 +1989,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; - strp_check_rcv(&sw_ctx_rx->strp); } diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index a6258bc8ec4f..3497f2d80328 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -15,13 +15,15 @@ SYNOPSIS *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } *COMMANDS* := - { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** + | **delete** | **pin** | **help** } MAP COMMANDS ============= | **bpftool** **map { show | list }** [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] | **bpftool** **map dump** *MAP* | **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] | **bpftool** **map lookup** *MAP* **key** *DATA* @@ -36,6 +38,11 @@ MAP COMMANDS | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } | *VALUE* := { *DATA* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** } DESCRIPTION =========== @@ -47,6 +54,10 @@ DESCRIPTION Output will start with map ID followed by map type and zero or more named attributes (depending on kernel version). + **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] + Create a new map with given parameters and pin it to *bpffs* + as *FILE*. + **bpftool map dump** *MAP* Dump all entries in a given *MAP*. diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 64156a16d530..12c803003ab2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -25,6 +25,8 @@ MAP COMMANDS | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes**}] | **bpftool** **prog pin** *PROG* *FILE* | **bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP* +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP* | **bpftool** **prog help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -37,6 +39,7 @@ MAP COMMANDS | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | } +| *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** } DESCRIPTION @@ -90,6 +93,14 @@ DESCRIPTION Note: *FILE* must be located in *bpffs* mount. + **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP* + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*) + to the map *MAP*. + + **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP* + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*) + from the map *MAP*. + **bpftool prog help** Print short help message. diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 8dda77daeda9..04cd4f92ab89 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -22,11 +22,11 @@ SYNOPSIS | { **-j** | **--json** } [{ **-p** | **--pretty** }] } *MAP-COMMANDS* := - { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **event_pipe** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** + | **delete** | **pin** | **event_pipe** | **help** } *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** - | **load** | **help** } + | **load** | **attach** | **detach** | **help** } *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } @@ -57,6 +57,10 @@ OPTIONS -p, --pretty Generate human-readable JSON output. Implies **-j**. + -m, --mapcompat + Allow loading maps with unknown map definitions. + + SEE ALSO ======== **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 74288a2197ab..dac7eff4c7e5 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -46,6 +46,13 @@ CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ -I$(srctree)/tools/lib/bpf \ -I$(srctree)/tools/perf CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' +ifneq ($(EXTRA_CFLAGS),) +CFLAGS += $(EXTRA_CFLAGS) +endif +ifneq ($(EXTRA_LDFLAGS),) +LDFLAGS += $(EXTRA_LDFLAGS) +endif + LIBS = -lelf -lbfd -lopcodes $(LIBBPF) INSTALL ?= install @@ -90,7 +97,7 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) - $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS) $(OUTPUT)%.o: %.c $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index df1060b852c1..c56545e87b0d 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -184,7 +184,7 @@ _bpftool() # Deal with options if [[ ${words[cword]} == -* ]]; then - local c='--version --json --pretty --bpffs' + local c='--version --json --pretty --bpffs --mapcompat' COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) return 0 fi @@ -292,6 +292,23 @@ _bpftool() fi return 0 ;; + attach|detach) + if [[ ${#words[@]} == 7 ]]; then + COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + return 0 + fi + + if [[ ${#words[@]} == 6 ]]; then + COMPREPLY=( $( compgen -W "msg_verdict skb_verdict skb_parse" -- "$cur" ) ) + return 0 + fi + + if [[ $prev == "$command" ]]; then + COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + return 0 + fi + return 0 + ;; load) local obj @@ -347,7 +364,7 @@ _bpftool() ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'dump help pin load \ + COMPREPLY=( $( compgen -W 'dump help pin attach detach load \ show list' -- "$cur" ) ) ;; esac @@ -370,6 +387,42 @@ _bpftool() ;; esac ;; + create) + case $prev in + $command) + _filedir + return 0 + ;; + type) + COMPREPLY=( $( compgen -W 'hash array prog_array \ + perf_event_array percpu_hash percpu_array \ + stack_trace cgroup_array lru_hash \ + lru_percpu_hash lpm_trie array_of_maps \ + hash_of_maps devmap sockmap cpumap xskmap \ + sockhash cgroup_storage reuseport_sockarray \ + percpu_cgroup_storage' -- \ + "$cur" ) ) + return 0 + ;; + key|value|flags|name|entries) + return 0 + ;; + dev) + _sysfs_get_netdevs + return 0 + ;; + *) + _bpftool_once_attr 'type' + _bpftool_once_attr 'key' + _bpftool_once_attr 'value' + _bpftool_once_attr 'entries' + _bpftool_once_attr 'name' + _bpftool_once_attr 'flags' + _bpftool_once_attr 'dev' + return 0 + ;; + esac + ;; lookup|getnext|delete) case $prev in $command) @@ -483,7 +536,7 @@ _bpftool() *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'delete dump getnext help \ - lookup pin event_pipe show list update' -- \ + lookup pin event_pipe show list update create' -- \ "$cur" ) ) ;; esac diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index b3a0709ea7ed..3318da8060bd 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -618,3 +618,24 @@ void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode) jsonw_string_field(json_wtr, "ifname", name); jsonw_end_object(json_wtr); } + +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what) +{ + char *endptr; + + NEXT_ARGP(); + + if (*val) { + p_err("%s already specified", what); + return -1; + } + + *val = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as %s", **argv, what); + return -1; + } + NEXT_ARGP(); + + return 0; +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 79dc3f193547..828dde30e9ec 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -55,6 +55,7 @@ json_writer_t *json_wtr; bool pretty_output; bool json_output; bool show_pinned; +int bpf_flags; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; @@ -341,6 +342,7 @@ int main(int argc, char **argv) { "pretty", no_argument, NULL, 'p' }, { "version", no_argument, NULL, 'V' }, { "bpffs", no_argument, NULL, 'f' }, + { "mapcompat", no_argument, NULL, 'm' }, { 0 } }; int opt, ret; @@ -355,7 +357,7 @@ int main(int argc, char **argv) hash_init(map_table.table); opterr = 0; - while ((opt = getopt_long(argc, argv, "Vhpjf", + while ((opt = getopt_long(argc, argv, "Vhpjfm", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -379,6 +381,9 @@ int main(int argc, char **argv) case 'f': show_pinned = true; break; + case 'm': + bpf_flags = MAPS_RELAX_COMPAT; + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 40492cdc4e53..28ee769bd11b 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -74,7 +74,7 @@ #define HELP_SPEC_PROGRAM \ "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }" #define HELP_SPEC_OPTIONS \ - "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} }" + "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} | {-m|--mapcompat}" #define HELP_SPEC_MAP \ "MAP := { id MAP_ID | pinned FILE }" @@ -89,6 +89,7 @@ extern const char *bin_name; extern json_writer_t *json_wtr; extern bool json_output; extern bool show_pinned; +extern int bpf_flags; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; @@ -138,6 +139,7 @@ int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); int do_net(int argc, char **arg); +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); int map_parse_fd(int *argc, char ***argv); int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 6003e9598973..7bf38f0e152e 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -36,6 +36,7 @@ #include <fcntl.h> #include <linux/err.h> #include <linux/kernel.h> +#include <net/if.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> @@ -94,6 +95,17 @@ static bool map_is_map_of_progs(__u32 type) return type == BPF_MAP_TYPE_PROG_ARRAY; } +static int map_type_from_str(const char *type) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(map_type_name); i++) + /* Don't allow prefixing in case of possible future shadowing */ + if (map_type_name[i] && !strcmp(map_type_name[i], type)) + return i; + return -1; +} + static void *alloc_value(struct bpf_map_info *info) { if (map_is_per_cpu(info->type)) @@ -336,6 +348,25 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key, jsonw_end_object(json_wtr); } +static void print_entry_error(struct bpf_map_info *info, unsigned char *key, + const char *value) +{ + int value_size = strlen(value); + bool single_line, break_names; + + break_names = info->key_size > 16 || value_size > 16; + single_line = info->key_size + value_size <= 24 && !break_names; + + printf("key:%c", break_names ? '\n' : ' '); + fprint_hex(stdout, key, info->key_size, " "); + + printf(single_line ? " " : "\n"); + + printf("value:%c%s", break_names ? '\n' : ' ', value); + + printf("\n"); +} + static void print_entry_plain(struct bpf_map_info *info, unsigned char *key, unsigned char *value) { @@ -658,6 +689,54 @@ static int do_show(int argc, char **argv) return errno == ENOENT ? 0 : -1; } +static int dump_map_elem(int fd, void *key, void *value, + struct bpf_map_info *map_info, struct btf *btf, + json_writer_t *btf_wtr) +{ + int num_elems = 0; + int lookup_errno; + + if (!bpf_map_lookup_elem(fd, key, value)) { + if (json_output) { + print_entry_json(map_info, key, value, btf); + } else { + if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + do_dump_btf(&d, map_info, key, value); + } else { + print_entry_plain(map_info, key, value); + } + num_elems++; + } + return num_elems; + } + + /* lookup error handling */ + lookup_errno = errno; + + if (map_is_map_of_maps(map_info->type) || + map_is_map_of_progs(map_info->type)) + return 0; + + if (json_output) { + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, map_info->key_size); + jsonw_name(json_wtr, "value"); + jsonw_start_object(json_wtr); + jsonw_string_field(json_wtr, "error", strerror(lookup_errno)); + jsonw_end_object(json_wtr); + } else { + print_entry_error(map_info, key, strerror(lookup_errno)); + } + + return 0; +} + static int do_dump(int argc, char **argv) { struct bpf_map_info info = {}; @@ -713,40 +792,7 @@ static int do_dump(int argc, char **argv) err = 0; break; } - - if (!bpf_map_lookup_elem(fd, key, value)) { - if (json_output) - print_entry_json(&info, key, value, btf); - else - if (btf) { - struct btf_dumper d = { - .btf = btf, - .jw = btf_wtr, - .is_plain_text = true, - }; - - do_dump_btf(&d, &info, key, value); - } else { - print_entry_plain(&info, key, value); - } - num_elems++; - } else if (!map_is_map_of_maps(info.type) && - !map_is_map_of_progs(info.type)) { - if (json_output) { - jsonw_name(json_wtr, "key"); - print_hex_data_json(key, info.key_size); - jsonw_name(json_wtr, "value"); - jsonw_start_object(json_wtr); - jsonw_string_field(json_wtr, "error", - "can't lookup element"); - jsonw_end_object(json_wtr); - } else { - p_info("can't lookup element with key: "); - fprint_hex(stderr, key, info.key_size, " "); - fprintf(stderr, "\n"); - } - } - + num_elems += dump_map_elem(fd, key, value, &info, btf, btf_wtr); prev_key = key; } @@ -1024,6 +1070,92 @@ static int do_pin(int argc, char **argv) return err; } +static int do_create(int argc, char **argv) +{ + struct bpf_create_map_attr attr = { NULL, }; + const char *pinfile; + int err, fd; + + if (!REQ_ARGS(7)) + return -1; + pinfile = GET_ARG(); + + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "type")) { + NEXT_ARG(); + + if (attr.map_type) { + p_err("map type already specified"); + return -1; + } + + attr.map_type = map_type_from_str(*argv); + if ((int)attr.map_type < 0) { + p_err("unrecognized map type: %s", *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "name")) { + NEXT_ARG(); + attr.name = GET_ARG(); + } else if (is_prefix(*argv, "key")) { + if (parse_u32_arg(&argc, &argv, &attr.key_size, + "key size")) + return -1; + } else if (is_prefix(*argv, "value")) { + if (parse_u32_arg(&argc, &argv, &attr.value_size, + "value size")) + return -1; + } else if (is_prefix(*argv, "entries")) { + if (parse_u32_arg(&argc, &argv, &attr.max_entries, + "max entries")) + return -1; + } else if (is_prefix(*argv, "flags")) { + if (parse_u32_arg(&argc, &argv, &attr.map_flags, + "flags")) + return -1; + } else if (is_prefix(*argv, "dev")) { + NEXT_ARG(); + + if (attr.map_ifindex) { + p_err("offload device already specified"); + return -1; + } + + attr.map_ifindex = if_nametoindex(*argv); + if (!attr.map_ifindex) { + p_err("unrecognized netdevice '%s': %s", + *argv, strerror(errno)); + return -1; + } + NEXT_ARG(); + } + } + + if (!attr.name) { + p_err("map name not specified"); + return -1; + } + + fd = bpf_create_map_xattr(&attr); + if (fd < 0) { + p_err("map create failed: %s", strerror(errno)); + return -1; + } + + err = do_pin_fd(fd, pinfile); + close(fd); + if (err) + return err; + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -1033,6 +1165,9 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [MAP]\n" + " %s %s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" + " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" + " [dev NAME]\n" " %s %s dump MAP\n" " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" " %s %s lookup MAP key DATA\n" @@ -1047,11 +1182,17 @@ static int do_help(int argc, char **argv) " " HELP_SPEC_PROGRAM "\n" " VALUE := { DATA | MAP | PROG }\n" " UPDATE_FLAGS := { any | exist | noexist }\n" + " TYPE := { hash | array | prog_array | perf_event_array | percpu_hash |\n" + " percpu_array | stack_trace | cgroup_array | lru_hash |\n" + " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" + " devmap | sockmap | cpumap | xskmap | sockhash |\n" + " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2]); return 0; } @@ -1067,6 +1208,7 @@ static const struct cmd cmds[] = { { "delete", do_delete }, { "pin", do_pin }, { "event_pipe", do_event_pipe }, + { "create", do_create }, { 0 } }; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b1cd3bc8db70..335028968dfb 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -77,6 +77,26 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", }; +static const char * const attach_type_strings[] = { + [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_MSG_VERDICT] = "msg_verdict", + [__MAX_BPF_ATTACH_TYPE] = NULL, +}; + +enum bpf_attach_type parse_attach_type(const char *str) +{ + enum bpf_attach_type type; + + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + if (attach_type_strings[type] && + is_prefix(str, attach_type_strings[type])) + return type; + } + + return __MAX_BPF_ATTACH_TYPE; +} + static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) { struct timespec real_time_ts, boot_time_ts; @@ -697,6 +717,77 @@ int map_replace_compar(const void *p1, const void *p2) return a->idx - b->idx; } +static int do_attach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, mapfd, progfd; + + if (!REQ_ARGS(5)) { + p_err("too few parameters for map attach"); + return -EINVAL; + } + + progfd = prog_parse_fd(&argc, &argv); + if (progfd < 0) + return progfd; + + attach_type = parse_attach_type(*argv); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + return -EINVAL; + } + NEXT_ARG(); + + mapfd = map_parse_fd(&argc, &argv); + if (mapfd < 0) + return mapfd; + + err = bpf_prog_attach(progfd, mapfd, attach_type, 0); + if (err) { + p_err("failed prog attach to map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + +static int do_detach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, mapfd, progfd; + + if (!REQ_ARGS(5)) { + p_err("too few parameters for map detach"); + return -EINVAL; + } + + progfd = prog_parse_fd(&argc, &argv); + if (progfd < 0) + return progfd; + + attach_type = parse_attach_type(*argv); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + return -EINVAL; + } + NEXT_ARG(); + + mapfd = map_parse_fd(&argc, &argv); + if (mapfd < 0) + return mapfd; + + err = bpf_prog_detach2(progfd, mapfd, attach_type); + if (err) { + p_err("failed prog detach from map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} static int do_load(int argc, char **argv) { enum bpf_attach_type expected_attach_type; @@ -817,7 +908,7 @@ static int do_load(int argc, char **argv) } } - obj = bpf_object__open_xattr(&attr); + obj = __bpf_object__open_xattr(&attr, bpf_flags); if (IS_ERR_OR_NULL(obj)) { p_err("failed to open object file"); goto err_free_reuse_maps; @@ -942,6 +1033,8 @@ static int do_help(int argc, char **argv) " %s %s pin PROG FILE\n" " %s %s load OBJ FILE [type TYPE] [dev NAME] \\\n" " [map { idx IDX | name NAME } MAP]\n" + " %s %s attach PROG ATTACH_TYPE MAP\n" + " %s %s detach PROG ATTACH_TYPE MAP\n" " %s %s help\n" "\n" " " HELP_SPEC_MAP "\n" @@ -953,10 +1046,12 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" + " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2]); return 0; } @@ -968,6 +1063,8 @@ static const struct cmd cmds[] = { { "dump", do_dump }, { "pin", do_pin }, { "load", do_load }, + { "attach", do_attach }, + { "detach", do_detach }, { 0 } }; diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 6ad27257fd67..79d84413ddf2 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -69,7 +69,7 @@ FEATURE_USER = .libbpf FEATURE_TESTS = libelf libelf-mmap bpf reallocarray FEATURE_DISPLAY = libelf bpf -INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi -I$(srctree)/tools/perf +INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES) check_feat := 1 diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 87520a87a75f..69a4d40c4227 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -69,6 +69,9 @@ struct bpf_load_program_attr { __u32 prog_ifindex; }; +/* Flags to direct loading requirements */ +#define MAPS_RELAX_COMPAT 0x01 + /* Recommend log buffer size */ #define BPF_LOG_BUF_SIZE (256 * 1024) int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ceb918c14d80..bd71efcc53be 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -19,7 +19,6 @@ #include <unistd.h> #include <fcntl.h> #include <errno.h> -#include <perf-sys.h> #include <asm/unistd.h> #include <linux/err.h> #include <linux/kernel.h> @@ -27,6 +26,7 @@ #include <linux/btf.h> #include <linux/list.h> #include <linux/limits.h> +#include <linux/perf_event.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/vfs.h> @@ -169,7 +169,7 @@ static LIST_HEAD(bpf_objects_list); struct bpf_object { char license[64]; - u32 kern_version; + __u32 kern_version; struct bpf_program *programs; size_t nr_programs; @@ -540,7 +540,7 @@ static int bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) { - u32 kver; + __u32 kver; if (size != sizeof(kver)) { pr_warning("invalid kver section in %s\n", obj->path); @@ -562,8 +562,9 @@ static int compare_bpf_map(const void *_a, const void *_b) } static int -bpf_object__init_maps(struct bpf_object *obj) +bpf_object__init_maps(struct bpf_object *obj, int flags) { + bool strict = !(flags & MAPS_RELAX_COMPAT); int i, map_idx, map_def_sz, nr_maps = 0; Elf_Scn *scn; Elf_Data *data; @@ -685,7 +686,8 @@ bpf_object__init_maps(struct bpf_object *obj) "has unrecognized, non-zero " "options\n", obj->path, map_name); - return -EINVAL; + if (strict) + return -EINVAL; } } memcpy(&obj->maps[map_idx].def, def, @@ -716,7 +718,7 @@ static bool section_have_execinstr(struct bpf_object *obj, int idx) return false; } -static int bpf_object__elf_collect(struct bpf_object *obj) +static int bpf_object__elf_collect(struct bpf_object *obj, int flags) { Elf *elf = obj->efile.elf; GElf_Ehdr *ep = &obj->efile.ehdr; @@ -843,7 +845,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) return LIBBPF_ERRNO__FORMAT; } if (obj->efile.maps_shndx >= 0) { - err = bpf_object__init_maps(obj); + err = bpf_object__init_maps(obj, flags); if (err) goto out; } @@ -1295,7 +1297,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) static int load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, const char *name, struct bpf_insn *insns, int insns_cnt, - char *license, u32 kern_version, int *pfd, int prog_ifindex) + char *license, __u32 kern_version, int *pfd, int prog_ifindex) { struct bpf_load_program_attr load_attr; char *cp, errmsg[STRERR_BUFSIZE]; @@ -1515,7 +1517,7 @@ static int bpf_object__validate(struct bpf_object *obj, bool needs_kver) static struct bpf_object * __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz, - bool needs_kver) + bool needs_kver, int flags) { struct bpf_object *obj; int err; @@ -1531,7 +1533,7 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz, CHECK_ERR(bpf_object__elf_init(obj), err, out); CHECK_ERR(bpf_object__check_endianness(obj), err, out); - CHECK_ERR(bpf_object__elf_collect(obj), err, out); + CHECK_ERR(bpf_object__elf_collect(obj, flags), err, out); CHECK_ERR(bpf_object__collect_reloc(obj), err, out); CHECK_ERR(bpf_object__validate(obj, needs_kver), err, out); @@ -1542,7 +1544,8 @@ out: return ERR_PTR(err); } -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) +struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr, + int flags) { /* param validation */ if (!attr->file) @@ -1551,7 +1554,13 @@ struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) pr_debug("loading %s\n", attr->file); return __bpf_object__open(attr->file, NULL, 0, - bpf_prog_type__needs_kver(attr->prog_type)); + bpf_prog_type__needs_kver(attr->prog_type), + flags); +} + +struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) +{ + return __bpf_object__open_xattr(attr, 0); } struct bpf_object *bpf_object__open(const char *path) @@ -1584,7 +1593,7 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf, pr_debug("loading object '%s' from buffer\n", name); - return __bpf_object__open(name, obj_buf, obj_buf_sz, true); + return __bpf_object__open(name, obj_buf, obj_buf_sz, true, true); } int bpf_object__unload(struct bpf_object *obj) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 8af8d3663991..7e9c801a9fdd 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -61,6 +61,8 @@ struct bpf_object_open_attr { struct bpf_object *bpf_object__open(const char *path); struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr); +struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr, + int flags); struct bpf_object *bpf_object__open_buffer(void *obj_buf, size_t obj_buf_sz, const char *name); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1381ab81099c..d99dd6fc3fbe 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,7 +36,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o + test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \ + test_sk_lookup_kern.o test_xdp_vlan.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -49,7 +50,10 @@ TEST_PROGS := test_kmod.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ test_skb_cgroup_id.sh \ - test_flow_dissector.sh + test_flow_dissector.sh \ + test_xdp_vlan.sh + +TEST_PROGS_EXTENDED := with_addr.sh # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \ diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 1d407b3494f9..fda8c162d0df 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -155,6 +155,10 @@ static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, (void *) BPF_FUNC_sk_lookup_udp; static int (*bpf_sk_release)(struct bpf_sock *sk) = (void *) BPF_FUNC_sk_release; +static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) = + (void *) BPF_FUNC_skb_vlan_push; +static int (*bpf_skb_vlan_pop)(void *ctx) = + (void *) BPF_FUNC_skb_vlan_pop; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 3655508f95fd..dd49df5e2df4 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -19,3 +19,4 @@ CONFIG_CRYPTO_SHA256=m CONFIG_VXLAN=y CONFIG_GENEVE=y CONFIG_NET_CLS_FLOWER=m +CONFIG_LWTUNNEL=y diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index ac7de38e5c63..10a5fa83c75a 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -71,6 +71,7 @@ int txmsg_start; int txmsg_end; int txmsg_ingress; int txmsg_skb; +int ktls; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -92,6 +93,7 @@ static const struct option long_options[] = { {"txmsg_end", required_argument, NULL, 'e'}, {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, {"txmsg_skb", no_argument, &txmsg_skb, 1 }, + {"ktls", no_argument, &ktls, 1 }, {0, 0, NULL, 0 } }; @@ -112,6 +114,76 @@ static void usage(char *argv[]) printf("\n"); } +#define TCP_ULP 31 +#define TLS_TX 1 +#define TLS_RX 2 +#include <linux/tls.h> + +char *sock_to_string(int s) +{ + if (s == c1) + return "client1"; + else if (s == c2) + return "client2"; + else if (s == s1) + return "server1"; + else if (s == s2) + return "server2"; + else if (s == p1) + return "peer1"; + else if (s == p2) + return "peer2"; + else + return "unknown"; +} + +static int sockmap_init_ktls(int verbose, int s) +{ + struct tls12_crypto_info_aes_gcm_128 tls_tx = { + .info = { + .version = TLS_1_2_VERSION, + .cipher_type = TLS_CIPHER_AES_GCM_128, + }, + }; + struct tls12_crypto_info_aes_gcm_128 tls_rx = { + .info = { + .version = TLS_1_2_VERSION, + .cipher_type = TLS_CIPHER_AES_GCM_128, + }, + }; + int so_buf = 6553500; + int err; + + err = setsockopt(s, 6, TCP_ULP, "tls", sizeof("tls")); + if (err) { + fprintf(stderr, "setsockopt: TCP_ULP(%s) failed with error %i\n", sock_to_string(s), err); + return -EINVAL; + } + err = setsockopt(s, SOL_TLS, TLS_TX, (void *)&tls_tx, sizeof(tls_tx)); + if (err) { + fprintf(stderr, "setsockopt: TLS_TX(%s) failed with error %i\n", sock_to_string(s), err); + return -EINVAL; + } + err = setsockopt(s, SOL_TLS, TLS_RX, (void *)&tls_rx, sizeof(tls_rx)); + if (err) { + fprintf(stderr, "setsockopt: TLS_RX(%s) failed with error %i\n", sock_to_string(s), err); + return -EINVAL; + } + err = setsockopt(s, SOL_SOCKET, SO_SNDBUF, &so_buf, sizeof(so_buf)); + if (err) { + fprintf(stderr, "setsockopt: (%s) failed sndbuf with error %i\n", sock_to_string(s), err); + return -EINVAL; + } + err = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &so_buf, sizeof(so_buf)); + if (err) { + fprintf(stderr, "setsockopt: (%s) failed rcvbuf with error %i\n", sock_to_string(s), err); + return -EINVAL; + } + + if (verbose) + fprintf(stdout, "socket(%s) kTLS enabled\n", sock_to_string(s)); + return 0; +} static int sockmap_init_sockets(int verbose) { int i, err, one = 1; @@ -456,6 +528,21 @@ static int sendmsg_test(struct sockmap_options *opt) else rx_fd = p2; + if (ktls) { + /* Redirecting into non-TLS socket which sends into a TLS + * socket is not a valid test. So in this case lets not + * enable kTLS but still run the test. + */ + if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) { + err = sockmap_init_ktls(opt->verbose, rx_fd); + if (err) + return err; + } + err = sockmap_init_ktls(opt->verbose, c1); + if (err) + return err; + } + rxpid = fork(); if (rxpid == 0) { if (opt->drop_expected) @@ -907,6 +994,8 @@ static void test_options(char *options) strncat(options, "ingress,", OPTSTRING); if (txmsg_skb) strncat(options, "skb,", OPTSTRING); + if (ktls) + strncat(options, "ktls,", OPTSTRING); } static int __test_exec(int cgrp, int test, struct sockmap_options *opt) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index bc9cd8537467..cf4cd32b6772 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -48,7 +48,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 8 +#define MAX_NR_MAPS 13 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -61,10 +61,14 @@ static bool unpriv_disabled = false; struct bpf_test { const char *descr; struct bpf_insn insns[MAX_INSNS]; - int fixup_map1[MAX_FIXUPS]; - int fixup_map2[MAX_FIXUPS]; - int fixup_map3[MAX_FIXUPS]; - int fixup_map4[MAX_FIXUPS]; + int fixup_map_hash_8b[MAX_FIXUPS]; + int fixup_map_hash_48b[MAX_FIXUPS]; + int fixup_map_hash_16b[MAX_FIXUPS]; + int fixup_map_array_48b[MAX_FIXUPS]; + int fixup_map_sockmap[MAX_FIXUPS]; + int fixup_map_sockhash[MAX_FIXUPS]; + int fixup_map_xskmap[MAX_FIXUPS]; + int fixup_map_stacktrace[MAX_FIXUPS]; int fixup_prog1[MAX_FIXUPS]; int fixup_prog2[MAX_FIXUPS]; int fixup_map_in_map[MAX_FIXUPS]; @@ -876,7 +880,7 @@ static struct bpf_test tests[] = { BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map1 = { 2 }, + .fixup_map_hash_8b = { 2 }, .errstr = "invalid indirect read from stack", .result = REJECT, }, @@ -1110,7 +1114,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R0 invalid mem access 'map_value_or_null'", .result = REJECT, }, @@ -1127,7 +1131,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "misaligned value access", .result = REJECT, .flags = F_LOAD_WITH_STRICT_ALIGNMENT, @@ -1147,7 +1151,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R0 invalid mem access", .errstr_unpriv = "R0 leaks addr", .result = REJECT, @@ -1237,7 +1241,7 @@ static struct bpf_test tests[] = { BPF_FUNC_map_delete_elem), BPF_EXIT_INSN(), }, - .fixup_map1 = { 24 }, + .fixup_map_hash_8b = { 24 }, .errstr_unpriv = "R1 pointer comparison", .result_unpriv = REJECT, .result = ACCEPT, @@ -1391,7 +1395,7 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, pkt_type)), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr = "different pointers", .errstr_unpriv = "R1 pointer comparison", .result = REJECT, @@ -1414,7 +1418,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JA, 0, 0, -12), }, - .fixup_map1 = { 6 }, + .fixup_map_hash_8b = { 6 }, .errstr = "different pointers", .errstr_unpriv = "R1 pointer comparison", .result = REJECT, @@ -1438,7 +1442,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JA, 0, 0, -13), }, - .fixup_map1 = { 7 }, + .fixup_map_hash_8b = { 7 }, .errstr = "different pointers", .errstr_unpriv = "R1 pointer comparison", .result = REJECT, @@ -2575,7 +2579,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr_unpriv = "R4 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, @@ -2592,7 +2596,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "invalid indirect read from stack off -8+0 size 8", .result = REJECT, }, @@ -2894,7 +2898,7 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, @@ -2934,7 +2938,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 1 }, + .fixup_map_hash_8b = { 1 }, .errstr_unpriv = "R1 pointer comparison", .result_unpriv = REJECT, .result = ACCEPT, @@ -4073,7 +4077,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 5 }, + .fixup_map_hash_8b = { 5 }, .result_unpriv = ACCEPT, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_XDP, @@ -4089,7 +4093,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 1 }, + .fixup_map_hash_8b = { 1 }, .result = REJECT, .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_XDP, @@ -4117,7 +4121,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 11 }, + .fixup_map_hash_8b = { 11 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_XDP, }, @@ -4139,7 +4143,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 7 }, + .fixup_map_hash_8b = { 7 }, .result = REJECT, .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_XDP, @@ -4161,7 +4165,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 6 }, + .fixup_map_hash_8b = { 6 }, .result = REJECT, .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_XDP, @@ -4184,7 +4188,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 5 }, + .fixup_map_hash_8b = { 5 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -4199,7 +4203,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 1 }, + .fixup_map_hash_8b = { 1 }, .result = REJECT, .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -4227,7 +4231,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 11 }, + .fixup_map_hash_8b = { 11 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -4249,7 +4253,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 7 }, + .fixup_map_hash_8b = { 7 }, .result = REJECT, .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -4271,7 +4275,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 6 }, + .fixup_map_hash_8b = { 6 }, .result = REJECT, .errstr = "invalid access to packet", .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -4542,6 +4546,85 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { + "prevent map lookup in sockmap", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .result = REJECT, + .errstr = "cannot pass map_type 15 into func bpf_map_lookup_elem", + .prog_type = BPF_PROG_TYPE_SOCK_OPS, + }, + { + "prevent map lookup in sockhash", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .result = REJECT, + .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem", + .prog_type = BPF_PROG_TYPE_SOCK_OPS, + }, + { + "prevent map lookup in xskmap", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }, + .fixup_map_xskmap = { 3 }, + .result = REJECT, + .errstr = "cannot pass map_type 17 into func bpf_map_lookup_elem", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "prevent map lookup in stack trace", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }, + .fixup_map_stacktrace = { 3 }, + .result = REJECT, + .errstr = "cannot pass map_type 7 into func bpf_map_lookup_elem", + .prog_type = BPF_PROG_TYPE_PERF_EVENT, + }, + { + "prevent map lookup in prog array", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }, + .fixup_prog2 = { 3 }, + .result = REJECT, + .errstr = "cannot pass map_type 3 into func bpf_map_lookup_elem", + }, + { "valid map access into an array with a constant", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), @@ -4555,7 +4638,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, @@ -4577,7 +4660,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, @@ -4601,7 +4684,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, @@ -4629,7 +4712,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result_unpriv = REJECT, .result = ACCEPT, @@ -4649,7 +4732,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=48 size=8", .result = REJECT, }, @@ -4670,7 +4753,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 min value is outside of the array range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -4692,7 +4775,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -4717,7 +4800,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .errstr = "R0 unbounded memory access", .result_unpriv = REJECT, @@ -4744,7 +4827,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .errstr = "invalid access to map value, value_size=48 off=44 size=8", .result_unpriv = REJECT, @@ -4774,7 +4857,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3, 11 }, + .fixup_map_hash_48b = { 3, 11 }, .errstr = "R0 pointer += pointer", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -4807,7 +4890,7 @@ static struct bpf_test tests[] = { BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .fixup_map1 = { 1 }, + .fixup_map_hash_8b = { 1 }, .result = REJECT, .errstr = "cannot pass map_type 1 into func bpf_get_local_storage", .prog_type = BPF_PROG_TYPE_CGROUP_SKB, @@ -4922,7 +5005,7 @@ static struct bpf_test tests[] = { BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .fixup_map1 = { 1 }, + .fixup_map_hash_8b = { 1 }, .result = REJECT, .errstr = "cannot pass map_type 1 into func bpf_get_local_storage", .prog_type = BPF_PROG_TYPE_CGROUP_SKB, @@ -5024,7 +5107,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -5045,7 +5128,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS @@ -5066,7 +5149,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS @@ -5087,7 +5170,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS @@ -5113,7 +5196,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .result = REJECT, .errstr = "R4 !read_ok", .prog_type = BPF_PROG_TYPE_SCHED_CLS @@ -5141,7 +5224,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -5162,7 +5245,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 unbounded memory access", .result = REJECT, .errstr_unpriv = "R0 leaks addr", @@ -5412,7 +5495,7 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, - .fixup_map1 = { 2 }, + .fixup_map_hash_8b = { 2 }, .errstr_unpriv = "R2 leaks addr into mem", .result_unpriv = REJECT, .result = REJECT, @@ -5442,7 +5525,7 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, - .fixup_map1 = { 1 }, + .fixup_map_hash_8b = { 1 }, .errstr_unpriv = "R2 leaks addr into ctx", .result_unpriv = REJECT, .result = ACCEPT, @@ -5464,7 +5547,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr_unpriv = "R6 leaks addr into mem", .result_unpriv = REJECT, .result = ACCEPT, @@ -5484,7 +5567,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5503,7 +5586,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5521,7 +5604,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_trace_printk), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=0 size=0", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5541,7 +5624,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=0 size=56", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5561,7 +5644,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5585,7 +5668,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5606,7 +5689,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5626,7 +5709,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_trace_printk), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=4 size=0", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5650,7 +5733,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=4 size=52", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5672,7 +5755,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5694,7 +5777,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5719,7 +5802,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5741,7 +5824,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5761,7 +5844,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_trace_printk), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R1 min value is outside of the array range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5786,7 +5869,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=4 size=52", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5809,7 +5892,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5832,7 +5915,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R2 min value is negative", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5858,7 +5941,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5881,7 +5964,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5903,7 +5986,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_trace_printk), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R1 min value is outside of the array range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5925,7 +6008,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R1 unbounded memory access", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5951,7 +6034,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=4 size=45", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -5975,7 +6058,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -5998,7 +6081,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = REJECT, .errstr = "R1 unbounded memory access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6022,7 +6105,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6045,7 +6128,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = REJECT, .errstr = "R1 unbounded memory access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6070,7 +6153,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6094,7 +6177,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6118,7 +6201,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = REJECT, .errstr = "R1 min value is negative", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6143,7 +6226,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6167,7 +6250,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6191,7 +6274,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = REJECT, .errstr = "R1 min value is negative", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6210,7 +6293,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 8 }, + .fixup_map_hash_16b = { 3, 8 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6230,7 +6313,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_update_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 10 }, + .fixup_map_hash_16b = { 3, 10 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6250,8 +6333,8 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_update_elem), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, - .fixup_map3 = { 10 }, + .fixup_map_hash_8b = { 3 }, + .fixup_map_hash_16b = { 10 }, .result = REJECT, .errstr = "invalid access to map value, value_size=8 off=0 size=16", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6272,7 +6355,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 9 }, + .fixup_map_hash_16b = { 3, 9 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6292,7 +6375,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 9 }, + .fixup_map_hash_16b = { 3, 9 }, .result = REJECT, .errstr = "invalid access to map value, value_size=16 off=12 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6312,7 +6395,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 9 }, + .fixup_map_hash_16b = { 3, 9 }, .result = REJECT, .errstr = "invalid access to map value, value_size=16 off=-4 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6334,7 +6417,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 10 }, + .fixup_map_hash_16b = { 3, 10 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6355,7 +6438,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 10 }, + .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, .errstr = "invalid access to map value, value_size=16 off=12 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6376,7 +6459,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 10 }, + .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, .errstr = "invalid access to map value, value_size=16 off=-4 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6399,7 +6482,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 11 }, + .fixup_map_hash_16b = { 3, 11 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6419,7 +6502,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 10 }, + .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6442,7 +6525,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map3 = { 3, 11 }, + .fixup_map_hash_16b = { 3, 11 }, .result = REJECT, .errstr = "invalid access to map value, value_size=16 off=9 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6464,7 +6547,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, @@ -6485,7 +6568,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, @@ -6502,7 +6585,7 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R1 !read_ok", .errstr = "R1 !read_ok", .result = REJECT, @@ -6536,7 +6619,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_7, -4, 24), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, @@ -6564,7 +6647,7 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 4), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, @@ -6583,7 +6666,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 bitwise operator &= on pointer", .result = REJECT, }, @@ -6600,7 +6683,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 32-bit pointer arithmetic prohibited", .result = REJECT, }, @@ -6617,7 +6700,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 pointer arithmetic with /= operator", .result = REJECT, }, @@ -6634,7 +6717,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 pointer arithmetic prohibited", .errstr = "invalid mem access 'inv'", .result = REJECT, @@ -6658,7 +6741,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 invalid mem access 'inv'", .result = REJECT, }, @@ -6681,7 +6764,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, .result_unpriv = REJECT, @@ -6927,7 +7010,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -6953,7 +7036,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "invalid access to map value, value_size=48 off=0 size=49", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6981,7 +7064,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -7008,7 +7091,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R1 min value is outside of the array range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -7080,7 +7163,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_csum_diff), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -7105,7 +7188,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_csum_diff), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -7128,7 +7211,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_csum_diff), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -7209,7 +7292,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -7230,7 +7313,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -7250,7 +7333,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_probe_read), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -7325,7 +7408,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 max value is outside of the array range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -7355,7 +7438,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr = "R0 max value is outside of the array range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -7708,7 +7791,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -7732,7 +7815,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -7758,7 +7841,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -7783,7 +7866,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -7807,7 +7890,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, { @@ -7831,7 +7914,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -7877,7 +7960,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, { @@ -7902,7 +7985,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -7928,7 +8011,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, { @@ -7953,7 +8036,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -7980,7 +8063,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -8006,7 +8089,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -8035,7 +8118,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, }, @@ -8065,7 +8148,7 @@ static struct bpf_test tests[] = { BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, -3), BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr = "R0 invalid mem access 'inv'", .result = REJECT, }, @@ -8093,7 +8176,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", .result = REJECT, .result_unpriv = REJECT, @@ -8120,7 +8203,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R0 max value is outside of the array range", .result = REJECT, }, @@ -8145,7 +8228,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", .result = REJECT, }, @@ -8171,7 +8254,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT }, { @@ -8196,7 +8279,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "map_value pointer and 4294967295", .result = REJECT }, @@ -8222,7 +8305,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R0 min value is outside of the array range", .result = REJECT }, @@ -8246,7 +8329,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr = "value_size=8 off=1073741825", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -8271,7 +8354,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 4 }, + .fixup_map_hash_8b = { 4 }, .errstr = "value 1073741823", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -8307,7 +8390,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT }, { @@ -8346,7 +8429,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ .errstr = "R0 unbounded memory access", .result = REJECT @@ -8389,7 +8472,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ .errstr = "R0 unbounded memory access", .result = REJECT @@ -8418,7 +8501,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT }, { @@ -8445,7 +8528,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R0 max value is outside of the array range", .result = REJECT }, @@ -8475,7 +8558,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R0 unbounded memory access", .result = REJECT }, @@ -8495,7 +8578,7 @@ static struct bpf_test tests[] = { BPF_JMP_A(0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "map_value pointer and 2147483646", .result = REJECT }, @@ -8517,7 +8600,7 @@ static struct bpf_test tests[] = { BPF_JMP_A(0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "pointer offset 1073741822", .result = REJECT }, @@ -8538,7 +8621,7 @@ static struct bpf_test tests[] = { BPF_JMP_A(0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "pointer offset -1073741822", .result = REJECT }, @@ -8560,7 +8643,7 @@ static struct bpf_test tests[] = { BPF_JMP_A(0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "map_value pointer and 1000000000000", .result = REJECT }, @@ -8580,7 +8663,7 @@ static struct bpf_test tests[] = { BPF_JMP_A(0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .retval = POINTER_VALUE, .result_unpriv = REJECT, @@ -8601,7 +8684,7 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = ACCEPT, .retval = POINTER_VALUE, .result_unpriv = REJECT, @@ -8669,7 +8752,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 5 }, + .fixup_map_hash_8b = { 5 }, .errstr = "variable stack read R2", .result = REJECT, .prog_type = BPF_PROG_TYPE_LWT_IN, @@ -8750,7 +8833,7 @@ static struct bpf_test tests[] = { offsetof(struct test_val, foo)), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", .errstr = "R0 unbounded memory access", .result_unpriv = REJECT, @@ -10284,7 +10367,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .fixup_map1 = { 16 }, + .fixup_map_hash_8b = { 16 }, .result = REJECT, .errstr = "R0 min value is outside of the array range", }, @@ -11235,7 +11318,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), /* return 0 */ }, .prog_type = BPF_PROG_TYPE_XDP, - .fixup_map1 = { 23 }, + .fixup_map_hash_8b = { 23 }, .result = ACCEPT, }, { @@ -11290,7 +11373,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), /* return 1 */ }, .prog_type = BPF_PROG_TYPE_XDP, - .fixup_map1 = { 23 }, + .fixup_map_hash_8b = { 23 }, .result = ACCEPT, }, { @@ -11345,7 +11428,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), /* return 1 */ }, .prog_type = BPF_PROG_TYPE_XDP, - .fixup_map1 = { 23 }, + .fixup_map_hash_8b = { 23 }, .result = REJECT, .errstr = "invalid read from stack off -16+0 size 8", }, @@ -11417,7 +11500,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .fixup_map1 = { 12, 22 }, + .fixup_map_hash_8b = { 12, 22 }, .result = REJECT, .errstr = "invalid access to map value, value_size=8 off=2 size=8", }, @@ -11489,7 +11572,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .fixup_map1 = { 12, 22 }, + .fixup_map_hash_8b = { 12, 22 }, .result = ACCEPT, }, { @@ -11560,7 +11643,7 @@ static struct bpf_test tests[] = { BPF_JMP_IMM(BPF_JA, 0, 0, -8), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .fixup_map1 = { 12, 22 }, + .fixup_map_hash_8b = { 12, 22 }, .result = REJECT, .errstr = "invalid access to map value, value_size=8 off=2 size=8", }, @@ -11632,7 +11715,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .fixup_map1 = { 12, 22 }, + .fixup_map_hash_8b = { 12, 22 }, .result = ACCEPT, }, { @@ -11703,7 +11786,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .fixup_map1 = { 12, 22 }, + .fixup_map_hash_8b = { 12, 22 }, .result = REJECT, .errstr = "R0 invalid mem access 'inv'", }, @@ -12048,7 +12131,7 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map1 = { 13 }, + .fixup_map_hash_8b = { 13 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_XDP, }, @@ -12075,7 +12158,7 @@ static struct bpf_test tests[] = { BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, - .fixup_map2 = { 6 }, + .fixup_map_hash_48b = { 6 }, .errstr = "invalid indirect read from stack off -8+0 size 8", .result = REJECT, .prog_type = BPF_PROG_TYPE_XDP, @@ -12107,8 +12190,8 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .fixup_map2 = { 13 }, - .fixup_map4 = { 16 }, + .fixup_map_hash_48b = { 13 }, + .fixup_map_array_48b = { 16 }, .result = ACCEPT, .retval = 1, }, @@ -12140,7 +12223,7 @@ static struct bpf_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_in_map = { 16 }, - .fixup_map4 = { 13 }, + .fixup_map_array_48b = { 13 }, .result = REJECT, .errstr = "R0 invalid mem access 'map_ptr'", }, @@ -12208,7 +12291,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_6, 0, 0xdead), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "R6 invalid mem access 'inv'", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -12232,7 +12315,7 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -16), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .errstr = "invalid read from stack off -16+0 size 8", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -12354,7 +12437,7 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3), BPF_EXIT_INSN(), }, - .fixup_map1 = { 3 }, + .fixup_map_hash_8b = { 3 }, .result = REJECT, .errstr = "misaligned value access off", .prog_type = BPF_PROG_TYPE_SCHED_CLS, @@ -12464,7 +12547,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_get_stack), BPF_EXIT_INSN(), }, - .fixup_map2 = { 4 }, + .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -13511,10 +13594,14 @@ static char bpf_vlog[UINT_MAX >> 8]; static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type, struct bpf_insn *prog, int *map_fds) { - int *fixup_map1 = test->fixup_map1; - int *fixup_map2 = test->fixup_map2; - int *fixup_map3 = test->fixup_map3; - int *fixup_map4 = test->fixup_map4; + int *fixup_map_hash_8b = test->fixup_map_hash_8b; + int *fixup_map_hash_48b = test->fixup_map_hash_48b; + int *fixup_map_hash_16b = test->fixup_map_hash_16b; + int *fixup_map_array_48b = test->fixup_map_array_48b; + int *fixup_map_sockmap = test->fixup_map_sockmap; + int *fixup_map_sockhash = test->fixup_map_sockhash; + int *fixup_map_xskmap = test->fixup_map_xskmap; + int *fixup_map_stacktrace = test->fixup_map_stacktrace; int *fixup_prog1 = test->fixup_prog1; int *fixup_prog2 = test->fixup_prog2; int *fixup_map_in_map = test->fixup_map_in_map; @@ -13528,40 +13615,40 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type, * for verifier and not do a runtime lookup, so the only thing * that really matters is value size in this case. */ - if (*fixup_map1) { + if (*fixup_map_hash_8b) { map_fds[0] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long), sizeof(long long), 1); do { - prog[*fixup_map1].imm = map_fds[0]; - fixup_map1++; - } while (*fixup_map1); + prog[*fixup_map_hash_8b].imm = map_fds[0]; + fixup_map_hash_8b++; + } while (*fixup_map_hash_8b); } - if (*fixup_map2) { + if (*fixup_map_hash_48b) { map_fds[1] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long), sizeof(struct test_val), 1); do { - prog[*fixup_map2].imm = map_fds[1]; - fixup_map2++; - } while (*fixup_map2); + prog[*fixup_map_hash_48b].imm = map_fds[1]; + fixup_map_hash_48b++; + } while (*fixup_map_hash_48b); } - if (*fixup_map3) { + if (*fixup_map_hash_16b) { map_fds[2] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long), sizeof(struct other_val), 1); do { - prog[*fixup_map3].imm = map_fds[2]; - fixup_map3++; - } while (*fixup_map3); + prog[*fixup_map_hash_16b].imm = map_fds[2]; + fixup_map_hash_16b++; + } while (*fixup_map_hash_16b); } - if (*fixup_map4) { + if (*fixup_map_array_48b) { map_fds[3] = create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(struct test_val), 1); do { - prog[*fixup_map4].imm = map_fds[3]; - fixup_map4++; - } while (*fixup_map4); + prog[*fixup_map_array_48b].imm = map_fds[3]; + fixup_map_array_48b++; + } while (*fixup_map_array_48b); } if (*fixup_prog1) { @@ -13603,6 +13690,38 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type, fixup_percpu_cgroup_storage++; } while (*fixup_percpu_cgroup_storage); } + if (*fixup_map_sockmap) { + map_fds[9] = create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int), + sizeof(int), 1); + do { + prog[*fixup_map_sockmap].imm = map_fds[9]; + fixup_map_sockmap++; + } while (*fixup_map_sockmap); + } + if (*fixup_map_sockhash) { + map_fds[10] = create_map(BPF_MAP_TYPE_SOCKHASH, sizeof(int), + sizeof(int), 1); + do { + prog[*fixup_map_sockhash].imm = map_fds[10]; + fixup_map_sockhash++; + } while (*fixup_map_sockhash); + } + if (*fixup_map_xskmap) { + map_fds[11] = create_map(BPF_MAP_TYPE_XSKMAP, sizeof(int), + sizeof(int), 1); + do { + prog[*fixup_map_xskmap].imm = map_fds[11]; + fixup_map_xskmap++; + } while (*fixup_map_xskmap); + } + if (*fixup_map_stacktrace) { + map_fds[12] = create_map(BPF_MAP_TYPE_STACK_TRACE, sizeof(u32), + sizeof(u64), 1); + do { + prog[*fixup_map_stacktrace].imm = map_fds[12]; + fixup_map_stacktrace++; + } while (fixup_map_stacktrace); + } } static void do_test_single(struct bpf_test *test, bool unpriv, diff --git a/tools/testing/selftests/bpf/test_xdp_vlan.c b/tools/testing/selftests/bpf/test_xdp_vlan.c new file mode 100644 index 000000000000..365a7d2d9f5c --- /dev/null +++ b/tools/testing/selftests/bpf/test_xdp_vlan.c @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2018 Jesper Dangaard Brouer. + * + * XDP/TC VLAN manipulation example + * + * GOTCHA: Remember to disable NIC hardware offloading of VLANs, + * else the VLAN tags are NOT inlined in the packet payload: + * + * # ethtool -K ixgbe2 rxvlan off + * + * Verify setting: + * # ethtool -k ixgbe2 | grep rx-vlan-offload + * rx-vlan-offload: off + * + */ +#include <stddef.h> +#include <stdbool.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/pkt_cls.h> + +#include "bpf_helpers.h" +#include "bpf_endian.h" + +/* linux/if_vlan.h have not exposed this as UAPI, thus mirror some here + * + * struct vlan_hdr - vlan header + * @h_vlan_TCI: priority and VLAN ID + * @h_vlan_encapsulated_proto: packet type ID or len + */ +struct _vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; +#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ +#define VLAN_PRIO_SHIFT 13 +#define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */ +#define VLAN_TAG_PRESENT VLAN_CFI_MASK +#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ +#define VLAN_N_VID 4096 + +struct parse_pkt { + __u16 l3_proto; + __u16 l3_offset; + __u16 vlan_outer; + __u16 vlan_inner; + __u8 vlan_outer_offset; + __u8 vlan_inner_offset; +}; + +char _license[] SEC("license") = "GPL"; + +static __always_inline +bool parse_eth_frame(struct ethhdr *eth, void *data_end, struct parse_pkt *pkt) +{ + __u16 eth_type; + __u8 offset; + + offset = sizeof(*eth); + /* Make sure packet is large enough for parsing eth + 2 VLAN headers */ + if ((void *)eth + offset + (2*sizeof(struct _vlan_hdr)) > data_end) + return false; + + eth_type = eth->h_proto; + + /* Handle outer VLAN tag */ + if (eth_type == bpf_htons(ETH_P_8021Q) + || eth_type == bpf_htons(ETH_P_8021AD)) { + struct _vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + pkt->vlan_outer_offset = offset; + pkt->vlan_outer = bpf_ntohs(vlan_hdr->h_vlan_TCI) + & VLAN_VID_MASK; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + offset += sizeof(*vlan_hdr); + } + + /* Handle inner (double) VLAN tag */ + if (eth_type == bpf_htons(ETH_P_8021Q) + || eth_type == bpf_htons(ETH_P_8021AD)) { + struct _vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + pkt->vlan_inner_offset = offset; + pkt->vlan_inner = bpf_ntohs(vlan_hdr->h_vlan_TCI) + & VLAN_VID_MASK; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + offset += sizeof(*vlan_hdr); + } + + pkt->l3_proto = bpf_ntohs(eth_type); /* Convert to host-byte-order */ + pkt->l3_offset = offset; + + return true; +} + +/* Hint, VLANs are choosen to hit network-byte-order issues */ +#define TESTVLAN 4011 /* 0xFAB */ +// #define TO_VLAN 4000 /* 0xFA0 (hint 0xOA0 = 160) */ + +SEC("xdp_drop_vlan_4011") +int xdp_prognum0(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct parse_pkt pkt = { 0 }; + + if (!parse_eth_frame(data, data_end, &pkt)) + return XDP_ABORTED; + + /* Drop specific VLAN ID example */ + if (pkt.vlan_outer == TESTVLAN) + return XDP_ABORTED; + /* + * Using XDP_ABORTED makes it possible to record this event, + * via tracepoint xdp:xdp_exception like: + * # perf record -a -e xdp:xdp_exception + * # perf script + */ + return XDP_PASS; +} +/* +Commands to setup VLAN on Linux to test packets gets dropped: + + export ROOTDEV=ixgbe2 + export VLANID=4011 + ip link add link $ROOTDEV name $ROOTDEV.$VLANID type vlan id $VLANID + ip link set dev $ROOTDEV.$VLANID up + + ip link set dev $ROOTDEV mtu 1508 + ip addr add 100.64.40.11/24 dev $ROOTDEV.$VLANID + +Load prog with ip tool: + + ip link set $ROOTDEV xdp off + ip link set $ROOTDEV xdp object xdp_vlan01_kern.o section xdp_drop_vlan_4011 + +*/ + +/* Changing VLAN to zero, have same practical effect as removing the VLAN. */ +#define TO_VLAN 0 + +SEC("xdp_vlan_change") +int xdp_prognum1(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct parse_pkt pkt = { 0 }; + + if (!parse_eth_frame(data, data_end, &pkt)) + return XDP_ABORTED; + + /* Change specific VLAN ID */ + if (pkt.vlan_outer == TESTVLAN) { + struct _vlan_hdr *vlan_hdr = data + pkt.vlan_outer_offset; + + /* Modifying VLAN, preserve top 4 bits */ + vlan_hdr->h_vlan_TCI = + bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000) + | TO_VLAN); + } + + return XDP_PASS; +} + +/* + * Show XDP+TC can cooperate, on creating a VLAN rewriter. + * 1. Create a XDP prog that can "pop"/remove a VLAN header. + * 2. Create a TC-bpf prog that egress can add a VLAN header. + */ + +#ifndef ETH_ALEN /* Ethernet MAC address length */ +#define ETH_ALEN 6 /* bytes */ +#endif +#define VLAN_HDR_SZ 4 /* bytes */ + +SEC("xdp_vlan_remove_outer") +int xdp_prognum2(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct parse_pkt pkt = { 0 }; + char *dest; + + if (!parse_eth_frame(data, data_end, &pkt)) + return XDP_ABORTED; + + /* Skip packet if no outer VLAN was detected */ + if (pkt.vlan_outer_offset == 0) + return XDP_PASS; + + /* Moving Ethernet header, dest overlap with src, memmove handle this */ + dest = data; + dest+= VLAN_HDR_SZ; + /* + * Notice: Taking over vlan_hdr->h_vlan_encapsulated_proto, by + * only moving two MAC addrs (12 bytes), not overwriting last 2 bytes + */ + __builtin_memmove(dest, data, ETH_ALEN * 2); + /* Note: LLVM built-in memmove inlining require size to be constant */ + + /* Move start of packet header seen by Linux kernel stack */ + bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ); + + return XDP_PASS; +} + +static __always_inline +void shift_mac_4bytes_16bit(void *data) +{ + __u16 *p = data; + + p[7] = p[5]; /* delete p[7] was vlan_hdr->h_vlan_TCI */ + p[6] = p[4]; /* delete p[6] was ethhdr->h_proto */ + p[5] = p[3]; + p[4] = p[2]; + p[3] = p[1]; + p[2] = p[0]; +} + +static __always_inline +void shift_mac_4bytes_32bit(void *data) +{ + __u32 *p = data; + + /* Assuming VLAN hdr present. The 4 bytes in p[3] that gets + * overwritten, is ethhdr->h_proto and vlan_hdr->h_vlan_TCI. + * The vlan_hdr->h_vlan_encapsulated_proto take over role as + * ethhdr->h_proto. + */ + p[3] = p[2]; + p[2] = p[1]; + p[1] = p[0]; +} + +SEC("xdp_vlan_remove_outer2") +int xdp_prognum3(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *orig_eth = data; + struct parse_pkt pkt = { 0 }; + + if (!parse_eth_frame(orig_eth, data_end, &pkt)) + return XDP_ABORTED; + + /* Skip packet if no outer VLAN was detected */ + if (pkt.vlan_outer_offset == 0) + return XDP_PASS; + + /* Simply shift down MAC addrs 4 bytes, overwrite h_proto + TCI */ + shift_mac_4bytes_32bit(data); + + /* Move start of packet header seen by Linux kernel stack */ + bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ); + + return XDP_PASS; +} + +/*===================================== + * BELOW: TC-hook based ebpf programs + * ==================================== + * The TC-clsact eBPF programs (currently) need to be attach via TC commands + */ + +SEC("tc_vlan_push") +int _tc_progA(struct __sk_buff *ctx) +{ + bpf_skb_vlan_push(ctx, bpf_htons(ETH_P_8021Q), TESTVLAN); + + return TC_ACT_OK; +} +/* +Commands to setup TC to use above bpf prog: + +export ROOTDEV=ixgbe2 +export FILE=xdp_vlan01_kern.o + +# Re-attach clsact to clear/flush existing role +tc qdisc del dev $ROOTDEV clsact 2> /dev/null ;\ +tc qdisc add dev $ROOTDEV clsact + +# Attach BPF prog EGRESS +tc filter add dev $ROOTDEV egress \ + prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push + +tc filter show dev $ROOTDEV egress +*/ diff --git a/tools/testing/selftests/bpf/test_xdp_vlan.sh b/tools/testing/selftests/bpf/test_xdp_vlan.sh new file mode 100755 index 000000000000..51a3a31d1aac --- /dev/null +++ b/tools/testing/selftests/bpf/test_xdp_vlan.sh @@ -0,0 +1,195 @@ +#!/bin/bash + +TESTNAME=xdp_vlan + +usage() { + echo "Testing XDP + TC eBPF VLAN manipulations: $TESTNAME" + echo "" + echo "Usage: $0 [-vfh]" + echo " -v | --verbose : Verbose" + echo " --flush : Flush before starting (e.g. after --interactive)" + echo " --interactive : Keep netns setup running after test-run" + echo "" +} + +cleanup() +{ + local status=$? + + if [ "$status" = "0" ]; then + echo "selftests: $TESTNAME [PASS]"; + else + echo "selftests: $TESTNAME [FAILED]"; + fi + + if [ -n "$INTERACTIVE" ]; then + echo "Namespace setup still active explore with:" + echo " ip netns exec ns1 bash" + echo " ip netns exec ns2 bash" + exit $status + fi + + set +e + ip link del veth1 2> /dev/null + ip netns del ns1 2> /dev/null + ip netns del ns2 2> /dev/null +} + +# Using external program "getopt" to get --long-options +OPTIONS=$(getopt -o hvfi: \ + --long verbose,flush,help,interactive,debug -- "$@") +if (( $? != 0 )); then + usage + echo "selftests: $TESTNAME [FAILED] Error calling getopt, unknown option?" + exit 2 +fi +eval set -- "$OPTIONS" + +## --- Parse command line arguments / parameters --- +while true; do + case "$1" in + -v | --verbose) + export VERBOSE=yes + shift + ;; + -i | --interactive | --debug ) + INTERACTIVE=yes + shift + ;; + -f | --flush ) + cleanup + shift + ;; + -- ) + shift + break + ;; + -h | --help ) + usage; + echo "selftests: $TESTNAME [SKIP] usage help info requested" + exit 0 + ;; + * ) + shift + break + ;; + esac +done + +if [ "$EUID" -ne 0 ]; then + echo "selftests: $TESTNAME [FAILED] need root privileges" + exit 1 +fi + +ip link set dev lo xdp off 2>/dev/null > /dev/null +if [ $? -ne 0 ];then + echo "selftests: $TESTNAME [SKIP] need ip xdp support" + exit 0 +fi + +# Interactive mode likely require us to cleanup netns +if [ -n "$INTERACTIVE" ]; then + ip link del veth1 2> /dev/null + ip netns del ns1 2> /dev/null + ip netns del ns2 2> /dev/null +fi + +# Exit on failure +set -e + +# Some shell-tools dependencies +which ip > /dev/null +which tc > /dev/null +which ethtool > /dev/null + +# Make rest of shell verbose, showing comments as doc/info +if [ -n "$VERBOSE" ]; then + set -v +fi + +# Create two namespaces +ip netns add ns1 +ip netns add ns2 + +# Run cleanup if failing or on kill +trap cleanup 0 2 3 6 9 + +# Create veth pair +ip link add veth1 type veth peer name veth2 + +# Move veth1 and veth2 into the respective namespaces +ip link set veth1 netns ns1 +ip link set veth2 netns ns2 + +# NOTICE: XDP require VLAN header inside packet payload +# - Thus, disable VLAN offloading driver features +# - For veth REMEMBER TX side VLAN-offload +# +# Disable rx-vlan-offload (mostly needed on ns1) +ip netns exec ns1 ethtool -K veth1 rxvlan off +ip netns exec ns2 ethtool -K veth2 rxvlan off +# +# Disable tx-vlan-offload (mostly needed on ns2) +ip netns exec ns2 ethtool -K veth2 txvlan off +ip netns exec ns1 ethtool -K veth1 txvlan off + +export IPADDR1=100.64.41.1 +export IPADDR2=100.64.41.2 + +# In ns1/veth1 add IP-addr on plain net_device +ip netns exec ns1 ip addr add ${IPADDR1}/24 dev veth1 +ip netns exec ns1 ip link set veth1 up + +# In ns2/veth2 create VLAN device +export VLAN=4011 +export DEVNS2=veth2 +ip netns exec ns2 ip link add link $DEVNS2 name $DEVNS2.$VLAN type vlan id $VLAN +ip netns exec ns2 ip addr add ${IPADDR2}/24 dev $DEVNS2.$VLAN +ip netns exec ns2 ip link set $DEVNS2 up +ip netns exec ns2 ip link set $DEVNS2.$VLAN up + +# Bringup lo in netns (to avoids confusing people using --interactive) +ip netns exec ns1 ip link set lo up +ip netns exec ns2 ip link set lo up + +# At this point, the hosts cannot reach each-other, +# because ns2 are using VLAN tags on the packets. + +ip netns exec ns2 sh -c 'ping -W 1 -c 1 100.64.41.1 || echo "Okay ping fails"' + + +# Now we can use the test_xdp_vlan.c program to pop/push these VLAN tags +# ---------------------------------------------------------------------- +# In ns1: ingress use XDP to remove VLAN tags +export DEVNS1=veth1 +export FILE=test_xdp_vlan.o + +# First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change" +export XDP_PROG=xdp_vlan_change +ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG + +# In ns1: egress use TC to add back VLAN tag 4011 +# (del cmd) +# tc qdisc del dev $DEVNS1 clsact 2> /dev/null +# +ip netns exec ns1 tc qdisc add dev $DEVNS1 clsact +ip netns exec ns1 tc filter add dev $DEVNS1 egress \ + prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push + +# Now the namespaces can reach each-other, test with ping: +ip netns exec ns2 ping -W 2 -c 3 $IPADDR1 +ip netns exec ns1 ping -W 2 -c 3 $IPADDR2 + +# Second test: Replace xdp prog, that fully remove vlan header +# +# Catch kernel bug for generic-XDP, that does didn't allow us to +# remove a VLAN header, because skb->protocol still contain VLAN +# ETH_P_8021Q indication, and this cause overwriting of our changes. +# +export XDP_PROG=xdp_vlan_remove_outer2 +ip netns exec ns1 ip link set $DEVNS1 xdp off +ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG + +# Now the namespaces should still be able reach each-other, test with ping: +ip netns exec ns2 ping -W 2 -c 3 $IPADDR1 +ip netns exec ns1 ping -W 2 -c 3 $IPADDR2 |