From 3f4c3127d332000530349db4843deece27fe5e0c Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Tue, 16 Oct 2018 10:36:01 -0700
Subject: bpf: sockmap, fix skmsg recvmsg handler to track size correctly

When converting sockmap to the new skmsg generic data structures we
missed that the recvmsg handler did not correctly use sg.size and
instead was using the individual element lengths. The result is that if
a sock is closed with outstanding data we omit the call to
sk_mem_uncharge() and can get the warning below.

[ 66.728282] WARNING: CPU: 6 PID: 5783 at net/core/stream.c:206 sk_stream_kill_queues+0x1fa/0x210

To fix this, correct the redirect handler to transfer the size along
with the scatterlist and also decrement the size from the recvmsg
handler. Now when a sock is closed the remaining 'size' will be
uncharged with sk_mem_uncharge().

Signed-off-by: John Fastabend
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/skmsg.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 0b919f0bc6d6..31df0d9fa536 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -176,6 +176,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
 {
 	dst->sg.data[which] = src->sg.data[which];
 	dst->sg.data[which].length  = size;
+	dst->sg.size		   += size;
 	src->sg.data[which].length -= size;
 	src->sg.data[which].offset += size;
 }

From 8734a162c13b1a893e7dff8de0df81fed04c51a6 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Tue, 16 Oct 2018 11:07:59 -0700
Subject: bpf: skmsg, improve sk_msg_used_element to work in cork context

Currently sk_msg_used_element is only called in the zerocopy context,
where cork is not possible; if that case happens we fall back to copy
mode. However, the helper is more useful if it works in all contexts.

This patch resolves the case where end == head: the ring is then either
full or empty, but the helper always reported it as empty. To fix this,
add a test for the full-ring case so that a full ring is not reported
as having 0 elements. This additional functionality will be used in the
next patches from the recvmsg context, where end == head with a full
ring is a valid case.

Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
---
 include/linux/skmsg.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 31df0d9fa536..22347b08e1f8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -187,18 +187,21 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src)
 	sk_msg_init(src);
 }

+static inline bool sk_msg_full(const struct sk_msg *msg)
+{
+	return (msg->sg.end == msg->sg.start) && msg->sg.size;
+}
+
 static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
 {
+	if (sk_msg_full(msg))
+		return MAX_MSG_FRAGS;
+
 	return msg->sg.end >= msg->sg.start ?
 		msg->sg.end - msg->sg.start :
 		msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
 }

-static inline bool sk_msg_full(const struct sk_msg *msg)
-{
-	return (msg->sg.end == msg->sg.start) && msg->sg.size;
-}
-
 static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
 {
 	return &msg->sg.data[which];

From 02c558b2d5d679fbbcaa5b9689484c7e0f8abb7b Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Tue, 16 Oct 2018 11:08:04 -0700
Subject: bpf: sockmap, support for msg_peek in sk_msg with redirect ingress

This adds support for the MSG_PEEK flag when doing redirect to ingress
and receiving on the sk_msg psock queue. Previously the flag was
ignored, which could confuse applications that expected it to work as
it does on other sockets.

Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
---
 include/net/tcp.h  |  2 +-
 net/ipv4/tcp_bpf.c | 42 +++++++++++++++++++++++++++---------------
 net/tls/tls_sw.c   |  3 ++-
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3600ae0f25c3..14fdd7ce9992 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2089,7 +2089,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
 int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len);
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len);
+		      struct msghdr *msg, int len, int flags);

 /* Call BPF_SOCK_OPS program that returns an int. If the return value
  * is < 0, then the BPF op failed (for example if the loaded BPF
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index f9d3cf185827..b7918d4caa30 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -39,17 +39,19 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
 }

 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len)
+		      struct msghdr *msg, int len, int flags)
 {
 	struct iov_iter *iter = &msg->msg_iter;
+	int peek = flags & MSG_PEEK;
 	int i, ret, copied = 0;
+	struct sk_msg *msg_rx;
+
+	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+					  struct sk_msg, list);

 	while (copied != len) {
 		struct scatterlist *sge;
-		struct sk_msg *msg_rx;
-
-		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-						  struct sk_msg, list);
 		if (unlikely(!msg_rx))
 			break;
@@ -70,22 +72,30 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			}

 			copied += copy;
-			sge->offset += copy;
-			sge->length -= copy;
-			sk_mem_uncharge(sk, copy);
-			msg_rx->sg.size -= copy;
-			if (!sge->length) {
-				i++;
-				if (i == MAX_SKB_FRAGS)
-					i = 0;
-				if (!msg_rx->skb)
-					put_page(page);
+			if (likely(!peek)) {
+				sge->offset += copy;
+				sge->length -= copy;
+				sk_mem_uncharge(sk, copy);
+				msg_rx->sg.size -= copy;
+
+				if (!sge->length) {
+					sk_msg_iter_var_next(i);
+					if (!msg_rx->skb)
+						put_page(page);
+				}
+			} else {
+				sk_msg_iter_var_next(i);
 			}

 			if (copied == len)
 				break;
 		} while (i != msg_rx->sg.end);

+		if (unlikely(peek)) {
+			msg_rx = list_next_entry(msg_rx, list);
+			continue;
+		}
+
 		msg_rx->sg.start = i;
 		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
 			list_del(&msg_rx->list);
@@ -93,6 +103,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			consume_skb(msg_rx->skb);
 			kfree(msg_rx);
 		}
+		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+						  struct sk_msg, list);
 	}

 	return copied;
@@ -115,7 +127,7 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len,
 				   nonblock, flags, addr_len);
 	lock_sock(sk);
 msg_bytes_ready:
-	copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
 		int data, err = 0;
 		long timeo;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index a525fc4c2a4b..5cd88ba8acd1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1478,7 +1478,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		skb = tls_wait_data(sk, psock, flags, timeo, &err);
 		if (!skb) {
 			if (psock) {
-				int ret = __tcp_bpf_recvmsg(sk, psock, msg, len);
+				int ret = __tcp_bpf_recvmsg(sk, psock,
+							    msg, len, flags);

 				if (ret > 0) {
 					copied += ret;

From b55cbc8d9b44aaee94f19e995a5f241d453763ee Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel
Date: Wed, 17 Oct 2018 16:24:48 +0200
Subject: bpf: fix doc of bpf_skb_adjust_room() in uapi

len_diff is signed.

Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)")
CC: Quentin Monnet
Signed-off-by: Nicolas Dichtel
Reviewed-by: Quentin Monnet
Signed-off-by: Alexei Starovoitov
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..5e46f6732781 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1433,7 +1433,7 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
  *	Description
  *		Grow or shrink the room for data in the packet associated to
  *		*skb* by *len_diff*, and according to the selected *mode*.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f9187b41dff6..5e46f6732781 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1433,7 +1433,7 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
  *	Description
  *		Grow or shrink the room for data in the packet associated to
  *		*skb* by *len_diff*, and according to the selected *mode*.

From 144991602e6a14d667b295f1b099e609ce857772 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:09 +0200
Subject: bpf: rename stack trace map operations

In the following patches queue and stack maps (FIFO and LIFO data
structures) will be implemented.
In order to avoid confusion and a possible name clash, rename
stack_map_ops to stack_trace_map_ops.

Signed-off-by: Mauricio Vasquez B
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_types.h | 2 +-
 kernel/bpf/stackmap.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fa48343a5ea1..7bad4e1947ed 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -51,7 +51,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops)
 #ifdef CONFIG_PERF_EVENTS
-BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b2ade10f7ec3..90daf285de03 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -600,7 +600,7 @@ static void stack_map_free(struct bpf_map *map)
 	put_callchain_buffers();
 }

-const struct bpf_map_ops stack_map_ops = {
+const struct bpf_map_ops stack_trace_map_ops = {
 	.map_alloc = stack_map_alloc,
 	.map_free = stack_map_free,
 	.map_get_next_key = stack_map_get_next_key,

From 2ea864c58f19bf70a0e2415f9f1c53814e07f1b4 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:20 +0200
Subject: bpf/verifier: add ARG_PTR_TO_UNINIT_MAP_VALUE

The ARG_PTR_TO_UNINIT_MAP_VALUE argument is a pointer to a memory zone
used to store the value of a map. It is basically the same as
ARG_PTR_TO_UNINIT_MEM, except that the size does not need to be passed
as an extra argument.

This will be used in the following patch, which implements new helpers
that receive a pointer to be filled with a map value.
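For illustration, a minimal sketch of how a helper prototype can use the
new argument type (the helper name below is hypothetical, not part of
this patch); the verifier checks [value, value + map->value_size) as
writable memory that does not have to be initialized:

  /* Hypothetical helper that fills *value* with map->value_size bytes.
   * ARG_PTR_TO_UNINIT_MAP_VALUE tells the verifier the buffer must be
   * valid and writable, but its contents may be uninitialized.
   */
  static const struct bpf_func_proto bpf_map_fetch_value_proto = {
  	.func      = bpf_map_fetch_value,	/* hypothetical */
  	.gpl_only  = false,
  	.ret_type  = RET_INTEGER,
  	.arg1_type = ARG_CONST_MAP_PTR,
  	.arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
  };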
Signed-off-by: Mauricio Vasquez B
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h   | 1 +
 kernel/bpf/verifier.c | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e60fff48288b..0f8b863e0229 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -138,6 +138,7 @@ enum bpf_arg_type {
 	ARG_CONST_MAP_PTR,	/* const argument used as pointer to bpf_map */
 	ARG_PTR_TO_MAP_KEY,	/* pointer to stack used as map key */
 	ARG_PTR_TO_MAP_VALUE,	/* pointer to stack used as map value */
+	ARG_PTR_TO_UNINIT_MAP_VALUE,	/* pointer to valid memory used to store a map value */

 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3f93a548a642..d84c91ac3b70 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2117,7 +2117,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	}

 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
-	    arg_type == ARG_PTR_TO_MAP_VALUE) {
+	    arg_type == ARG_PTR_TO_MAP_VALUE ||
+	    arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
 		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
 		    type != expected_type)
@@ -2187,7 +2188,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->key_size, false,
 					      NULL);
-	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+	} else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+		   arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
 		 */
@@ -2196,9 +2198,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
+		meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->value_size, false,
-					      NULL);
+					      meta);
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);

From f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:25 +0200
Subject: bpf: add queue and stack maps

Queue/stack maps implement FIFO/LIFO data storage for eBPF programs.
These maps support peek, pop and push operations that are exposed to
eBPF programs through the new bpf_map_{peek,pop,push}_elem helpers.
Those operations are exposed to userspace applications through the
already existing syscalls in the following way:

  BPF_MAP_LOOKUP_ELEM            -> peek
  BPF_MAP_LOOKUP_AND_DELETE_ELEM -> pop
  BPF_MAP_UPDATE_ELEM            -> push

Queue/stack maps are implemented using a buffer plus tail and head
indexes, hence BPF_F_NO_PREALLOC is not supported.

In contrast to other maps, queue and stack maps do not use RCU for
protecting map values: bpf_map_{peek,pop}_elem take an
ARG_PTR_TO_UNINIT_MAP_VALUE argument, which is a pointer to a memory
zone where the value of the map is saved. This is basically the same as
ARG_PTR_TO_UNINIT_MEM, except that the size does not need to be passed
as an extra argument.

Our main motivation for implementing queue/stack maps was to keep track
of a pool of elements, like network ports in a SNAT; however, we
foresee other use cases, such as saving the last N kernel events in a
map and then analysing them from userspace.
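As a usage sketch (editor's illustration, not part of the patch; the map
name, section name and helper declarations are assumptions), an eBPF
program could maintain such a pool of ports like this:

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  /* The new helpers are not yet in bpf_helpers.h at this point in the
   * series, so declare them by helper id here.
   */
  static int (*bpf_map_push_elem)(void *map, const void *value,
  				  unsigned long long flags) =
  	(void *) BPF_FUNC_map_push_elem;
  static int (*bpf_map_pop_elem)(void *map, void *value) =
  	(void *) BPF_FUNC_map_pop_elem;

  struct bpf_map_def SEC("maps") port_pool = {
  	.type        = BPF_MAP_TYPE_QUEUE,
  	.key_size    = 0,	/* queue/stack maps have no key */
  	.value_size  = sizeof(__u32),
  	.max_entries = 1024,
  };

  SEC("xdp")
  int snat_pool(struct xdp_md *ctx)
  {
  	__u32 port;

  	/* FIFO: pop the oldest free port; fails when the pool is empty */
  	if (bpf_map_pop_elem(&port_pool, &port))
  		return XDP_PASS;

  	/* ... use the port, then return it to the pool. BPF_EXIST would
  	 * instead overwrite the oldest element if the queue were full.
  	 */
  	bpf_map_push_elem(&port_pool, &port, 0);
  	return XDP_PASS;
  }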
Signed-off-by: Mauricio Vasquez B
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h           |   6 +
 include/linux/bpf_types.h     |   2 +
 include/uapi/linux/bpf.h      |  29 ++++-
 kernel/bpf/Makefile           |   2 +-
 kernel/bpf/core.c             |   3 +
 kernel/bpf/helpers.c          |  43 +++++++
 kernel/bpf/queue_stack_maps.c | 288 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c          |   6 +
 kernel/bpf/verifier.c         |  19 ++-
 net/core/filter.c             |   6 +
 10 files changed, 401 insertions(+), 3 deletions(-)
 create mode 100644 kernel/bpf/queue_stack_maps.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0f8b863e0229..33014ae73103 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,9 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	int (*map_pop_elem)(struct bpf_map *map, void *value);
+	int (*map_peek_elem)(struct bpf_map *map, void *value);

 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
@@ -811,6 +814,9 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
+extern const struct bpf_func_proto bpf_map_push_elem_proto;
+extern const struct bpf_func_proto bpf_map_pop_elem_proto;
+extern const struct bpf_func_proto bpf_map_peek_elem_proto;

 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 7bad4e1947ed..44d9ab4809bd 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -69,3 +69,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
 #endif
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5e46f6732781..70082cb626b4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -128,6 +128,8 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_QUEUE,
+	BPF_MAP_TYPE_STACK,
 };

 enum bpf_prog_type {
@@ -462,6 +464,28 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ *	Description
+ *		Push an element *value* into *map*. *flags* is one of:
+ *
+ *		**BPF_EXIST**
+ *			If the queue/stack is full, the oldest element is
+ *			removed to make room for this one.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ *	Description
+ *		Pop an element from *map*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ *	Description
+ *		Get an element from *map* without removing it.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  *	Description
  *		For tracing programs, safely attempt to read *size* bytes from
@@ -2303,7 +2327,10 @@ union bpf_attr {
 	FN(skb_ancestor_cgroup_id),	\
 	FN(sk_lookup_tcp),		\
 	FN(sk_lookup_udp),		\
-	FN(sk_release),
+	FN(sk_release),			\
+	FN(map_push_elem),		\
+	FN(map_pop_elem),		\
+	FN(map_peek_elem),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index ff8262626b8f..4c2fa3ac56f6 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,7 +3,7 @@ obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index defcf4df6d91..7c7eeea8cffc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32)
 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
 const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+const struct bpf_func_proto bpf_map_push_elem_proto __weak;
+const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
+const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6502115e8f55..ab0d5e3f9892 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.arg2_type = ARG_PTR_TO_MAP_KEY,
 };

+BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
+{
+	return map->ops->map_push_elem(map, value, flags);
+}
+
+const struct bpf_func_proto bpf_map_push_elem_proto = {
+	.func = bpf_map_push_elem,
+	.gpl_only = false,
+	.pkt_access = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_VALUE,
+	.arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
+{
+	return map->ops->map_pop_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_pop_elem_proto = {
+	.func = bpf_map_pop_elem,
+	.gpl_only = false,
+	.pkt_access = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+};
+
+BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
+{
+	return map->ops->map_peek_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_peek_elem_proto = {
+	.func = bpf_map_peek_elem,
+	.gpl_only = false,
+	.pkt_access = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+};
+
 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 	.func = bpf_user_rnd_u32,
 	.gpl_only = false,
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
new file mode 100644
index 000000000000..12a93fb37449
--- /dev/null
+++ b/kernel/bpf/queue_stack_maps.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * queue_stack_maps.c: BPF queue and stack maps
+ *
+ * Copyright (c) 2018 Politecnico di Torino
Torino + */ +#include +#include +#include +#include "percpu_freelist.h" + +#define QUEUE_STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + + +struct bpf_queue_stack { + struct bpf_map map; + raw_spinlock_t lock; + u32 head, tail; + u32 size; /* max_entries + 1 */ + + char elements[0] __aligned(8); +}; + +static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +{ + return container_of(map, struct bpf_queue_stack, map); +} + +static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +{ + return qs->head == qs->tail; +} + +static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +{ + u32 head = qs->head + 1; + + if (unlikely(head >= qs->size)) + head = 0; + + return head == qs->tail; +} + +/* Called from syscall */ +static int queue_stack_map_alloc_check(union bpf_attr *attr) +{ + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 0 || + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + return -EINVAL; + + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +{ + int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_queue_stack *qs; + u32 size, value_size; + u64 queue_size, cost; + + size = attr->max_entries + 1; + value_size = attr->value_size; + + queue_size = sizeof(*qs) + (u64) value_size * size; + + cost = queue_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); + + qs = bpf_map_area_alloc(queue_size, numa_node); + if (!qs) + return ERR_PTR(-ENOMEM); + + memset(qs, 0, sizeof(*qs)); + + bpf_map_init_from_attr(&qs->map, attr); + + qs->map.pages = cost; + qs->size = size; + + raw_spin_lock_init(&qs->lock); + + return &qs->map; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void queue_stack_map_free(struct bpf_map *map) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	bpf_map_area_free(qs);
+}
+
+static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long flags;
+	int err = 0;
+	void *ptr;
+
+	raw_spin_lock_irqsave(&qs->lock, flags);
+
+	if (queue_stack_map_is_empty(qs)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	ptr = &qs->elements[qs->tail * qs->map.value_size];
+	memcpy(value, ptr, qs->map.value_size);
+
+	if (delete) {
+		if (unlikely(++qs->tail >= qs->size))
+			qs->tail = 0;
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	return err;
+}
+
+static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long flags;
+	int err = 0;
+	void *ptr;
+	u32 index;
+
+	raw_spin_lock_irqsave(&qs->lock, flags);
+
+	if (queue_stack_map_is_empty(qs)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	index = qs->head - 1;
+	if (unlikely(index >= qs->size))
+		index = qs->size - 1;
+
+	ptr = &qs->elements[index * qs->map.value_size];
+	memcpy(value, ptr, qs->map.value_size);
+
+	if (delete)
+		qs->head = index;
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	return err;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_peek_elem(struct bpf_map *map, void *value)
+{
+	return __queue_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_peek_elem(struct bpf_map *map, void *value)
+{
+	return __stack_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_pop_elem(struct bpf_map *map, void *value)
+{
+	return __queue_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_pop_elem(struct bpf_map *map, void *value)
+{
+	return __stack_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
+				     u64 flags)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long irq_flags;
+	int err = 0;
+	void *dst;
+
+	/* BPF_EXIST is used to force making room for a new element in case the
+	 * map is full
+	 */
+	bool replace = (flags & BPF_EXIST);
+
+	/* Check supported flags for queue and stack maps */
+	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&qs->lock, irq_flags);
+
+	if (queue_stack_map_is_full(qs)) {
+		if (!replace) {
+			err = -E2BIG;
+			goto out;
+		}
+		/* advance tail pointer to overwrite oldest element */
+		if (unlikely(++qs->tail >= qs->size))
+			qs->tail = 0;
+	}
+
+	dst = &qs->elements[qs->head * qs->map.value_size];
+	memcpy(dst, value, qs->map.value_size);
+
+	if (unlikely(++qs->head >= qs->size))
+		qs->head = 0;
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+	return err;
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 flags)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
+					void *next_key)
+{
+	return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+	.map_alloc_check = queue_stack_map_alloc_check,
+	.map_alloc = queue_stack_map_alloc,
+	.map_free = queue_stack_map_free,
+	.map_lookup_elem = queue_stack_map_lookup_elem,
+	.map_update_elem = queue_stack_map_update_elem,
+	.map_delete_elem = queue_stack_map_delete_elem,
+	.map_push_elem = queue_stack_map_push_elem,
+	.map_pop_elem = queue_map_pop_elem,
+	.map_peek_elem = queue_map_peek_elem,
+	.map_get_next_key = queue_stack_map_get_next_key,
+};
+
+const struct bpf_map_ops stack_map_ops = {
+	.map_alloc_check = queue_stack_map_alloc_check,
+	.map_alloc = queue_stack_map_alloc,
+	.map_free = queue_stack_map_free,
+	.map_lookup_elem = queue_stack_map_lookup_elem,
+	.map_update_elem = queue_stack_map_update_elem,
+	.map_delete_elem = queue_stack_map_delete_elem,
+	.map_push_elem = queue_stack_map_push_elem,
+	.map_pop_elem = stack_map_pop_elem,
+	.map_peek_elem = stack_map_peek_elem,
+	.map_get_next_key = queue_stack_map_get_next_key,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 78d9dd95e25f..1617407f9ee5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -727,6 +727,9 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
 		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_peek_elem(map, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
@@ -857,6 +860,9 @@ static int map_update_elem(union bpf_attr *attr)
 		/* rcu_read_lock() is not needed */
 		err = bpf_fd_reuseport_array_update_elem(map, key, value,
 							 attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_push_elem(map, value, attr->flags);
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d84c91ac3b70..7d6d9cf9ebd5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2324,6 +2324,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_sk_select_reuseport)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_QUEUE:
+	case BPF_MAP_TYPE_STACK:
+		if (func_id != BPF_FUNC_map_peek_elem &&
+		    func_id != BPF_FUNC_map_pop_elem &&
+		    func_id != BPF_FUNC_map_push_elem)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2380,6 +2387,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
 			goto error;
 		break;
+	case BPF_FUNC_map_peek_elem:
+	case BPF_FUNC_map_pop_elem:
+	case BPF_FUNC_map_push_elem:
+		if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+		    map->map_type != BPF_MAP_TYPE_STACK)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2675,7 +2689,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	if (func_id != BPF_FUNC_tail_call &&
 	    func_id != BPF_FUNC_map_lookup_elem &&
 	    func_id != BPF_FUNC_map_update_elem &&
-	    func_id != BPF_FUNC_map_delete_elem)
+	    func_id != BPF_FUNC_map_delete_elem &&
+	    func_id != BPF_FUNC_map_push_elem &&
+	    func_id != BPF_FUNC_map_pop_elem &&
+	    func_id != BPF_FUNC_map_peek_elem)
 		return 0;

 	if (meta->map_ptr == NULL) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 1a3ac6c46873..ea48ec789b5c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4876,6 +4876,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_map_update_elem_proto;
 	case BPF_FUNC_map_delete_elem:
 		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_map_push_elem:
+		return &bpf_map_push_elem_proto;
+	case BPF_FUNC_map_pop_elem:
+		return &bpf_map_pop_elem_proto;
+	case BPF_FUNC_map_peek_elem:
+		return &bpf_map_peek_elem_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:

From bd513cd08f10cbe28856f99ae951e86e86803861 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:30 +0200
Subject: bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall

The previous patch implemented the bpf queue/stack maps that provide
the peek/pop/push operations. There is no direct relationship between
those operations and the existing map syscalls, hence a new
BPF_MAP_LOOKUP_AND_DELETE_ELEM syscall command is added. It is mapped
to the pop operation in the queue/stack maps and remains to be
implemented for other kinds of maps.

Signed-off-by: Mauricio Vasquez B
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/syscall.c     | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70082cb626b4..a2fb333290dc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };

 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1617407f9ee5..49ae64a26562 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -999,6 +999,69 @@ err_put:
 	return err;
 }

+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_user_ptr(attr->key);
+	void __user *uvalue = u64_to_user_ptr(attr->value);
+	int ufd = attr->map_fd;
+	struct bpf_map *map;
+	void *key, *value, *ptr;
+	u32 value_size;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	key = __bpf_copy_key(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto err_put;
+	}
+
+	value_size = map->value_size;
+
+	err = -ENOMEM;
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	    map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_pop_elem(map, value);
+	} else {
+		err = -ENOTSUPP;
+	}
+
+	if (err)
+		goto free_value;
+
+	if (copy_to_user(uvalue, value, value_size) != 0) {
+		err = -EFAULT;
+		goto free_value;
+	}
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
 	[_id] = & _name ## _prog_ops,
@@ -2472,6 +2535,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_TASK_FD_QUERY:
 		err = bpf_task_fd_query(&attr, uattr);
 		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+		err = map_lookup_and_delete_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;

From b39b5f411dcfce28ff954e5d6acb2c11be3cb0ec Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Fri, 19 Oct 2018 09:57:57 -0700
Subject: bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB

BPF programs of type BPF_PROG_TYPE_CGROUP_SKB need to access headers in
the skb. This patch enables direct access to the skb for these
programs.

Two helper functions, bpf_compute_and_save_data_end() and
bpf_restore_data_end(), are introduced. They are used in
__cgroup_bpf_run_filter_skb() to compute the proper data_end for the
BPF program and to restore the original data afterwards.

Signed-off-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/linux/filter.h | 21 +++++++++++++++++++++
 kernel/bpf/cgroup.c    |  6 ++++++
 net/core/filter.c      | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5771874bc01e..91b4c934f02e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -548,6 +548,27 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	cb->data_end  = skb->data + skb_headlen(skb);
 }

+/* Similar to bpf_compute_data_pointers(), except that it saves the
+ * original data_end in *saved_data_end so it can be restored later.
+ */
+static inline void bpf_compute_and_save_data_end(
+	struct sk_buff *skb, void **saved_data_end)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	*saved_data_end = cb->data_end;
+	cb->data_end = skb->data + skb_headlen(skb);
+}
+
+/* Restore data_end saved by bpf_compute_and_save_data_end(). */
+static inline void bpf_restore_data_end(
+	struct sk_buff *skb, void *saved_data_end)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	cb->data_end = saved_data_end;
+}
+
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 {
 	/* eBPF programs may read/write skb->cb[] area to transfer meta
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 00f6ed2e4f9a..9425c2fb872f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -553,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 {
 	unsigned int offset = skb->data - skb_network_header(skb);
 	struct sock *save_sk;
+	void *saved_data_end;
 	struct cgroup *cgrp;
 	int ret;

@@ -566,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	save_sk = skb->sk;
 	skb->sk = sk;
 	__skb_push(skb, offset);
+
+	/* compute pointers for the bpf prog */
+	bpf_compute_and_save_data_end(skb, &saved_data_end);
+
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
 				 bpf_prog_run_save_cb);
+	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;
 	return ret == 1 ? 0 : -EPERM;
diff --git a/net/core/filter.c b/net/core/filter.c
index ea48ec789b5c..5fd5139e8638 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5352,6 +5352,40 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }

+static bool cg_skb_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   const struct bpf_prog *prog,
+				   struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		return false;
+	}
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case bpf_ctx_range(struct __sk_buff, mark):
+		case bpf_ctx_range(struct __sk_buff, priority):
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
 static bool lwt_is_valid_access(int off, int size,
 				enum bpf_access_type type,
 				const struct bpf_prog *prog,
@@ -7044,7 +7078,7 @@ const struct bpf_prog_ops xdp_prog_ops = {

 const struct bpf_verifier_ops cg_skb_verifier_ops = {
 	.get_func_proto		= cg_skb_func_proto,
-	.is_valid_access	= sk_filter_is_valid_access,
+	.is_valid_access	= cg_skb_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
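For context, here is a minimal sketch of the direct packet access this
change enables (editor's illustration; the program is hypothetical and
assumes the usual selftests bpf_helpers.h for SEC()):

  #include <linux/bpf.h>
  #include <linux/ip.h>
  #include <linux/in.h>
  #include "bpf_helpers.h"

  SEC("cgroup_skb/egress")
  int cg_skb_filter(struct __sk_buff *skb)
  {
  	void *data = (void *)(long)skb->data;
  	void *data_end = (void *)(long)skb->data_end;
  	struct iphdr *iph = data;

  	/* A bounds check is mandatory before dereferencing packet data;
  	 * for cgroup skb programs, data starts at the network header.
  	 */
  	if ((void *)(iph + 1) > data_end)
  		return 1;	/* 1 == allow */

  	if (iph->protocol == IPPROTO_TCP) {
  		/* e.g. account or police TCP traffic here */
  	}
  	return 1;
  }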
From 5032d079909d1ac5c2535acc32d5f01cd245d8ea Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Thu, 18 Oct 2018 13:58:35 -0700
Subject: bpf: skmsg, fix psock create on existing kcm/tls port

Before using the psock returned by sk_psock_get() when adding it to a
sockmap, we need to ensure it is actually a sockmap-based psock.
Previously we were only checking this after incrementing the reference
counter, which was an error. This resulted in a slab-out-of-bounds
access when the psock was not actually a sockmap type.

This moves the check up so that the reference counter is only used if
it is a sockmap psock.
Eric reported the following KASAN BUG:

  BUG: KASAN: slab-out-of-bounds in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
  BUG: KASAN: slab-out-of-bounds in refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
  Read of size 4 at addr ffff88019548be58 by task syz-executor4/22387

  CPU: 1 PID: 22387 Comm: syz-executor4 Not tainted 4.19.0-rc7+ #264
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113
   print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256
   kasan_report_error mm/kasan/report.c:354 [inline]
   kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412
   check_memory_region_inline mm/kasan/kasan.c:260 [inline]
   check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
   kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
   atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
   refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
   sk_psock_get include/linux/skmsg.h:379 [inline]
   sock_map_link.isra.6+0x41f/0xe30 net/core/sock_map.c:178
   sock_hash_update_common+0x19b/0x11e0 net/core/sock_map.c:669
   sock_hash_update_elem+0x306/0x470 net/core/sock_map.c:738
   map_update_elem+0x819/0xdf0 kernel/bpf/syscall.c:818

Signed-off-by: John Fastabend
Reported-by: Eric Dumazet
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Daniel Borkmann
---
 include/linux/skmsg.h | 25 ++++++++++++++++++++-----
 net/core/sock_map.c   | 11 ++++++-----
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 22347b08e1f8..84e18863f6a4 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -270,11 +270,6 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }

-static inline bool sk_has_psock(struct sock *sk)
-{
-	return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
-}
-
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
@@ -374,6 +369,26 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock,
 	return test_bit(bit, &psock->state);
 }

+static inline struct sk_psock *sk_psock_get_checked(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock) {
+		if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) {
+			psock = ERR_PTR(-EBUSY);
+			goto out;
+		}
+
+		if (!refcount_inc_not_zero(&psock->refcnt))
+			psock = ERR_PTR(-EBUSY);
+	}
+out:
+	rcu_read_unlock();
+	return psock;
+}
+
 static inline struct sk_psock *sk_psock_get(struct sock *sk)
 {
 	struct sk_psock *psock;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3c0e44cb811a..be6092ac69f8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -175,12 +175,13 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 		}
 	}

-	psock = sk_psock_get(sk);
+	psock = sk_psock_get_checked(sk);
+	if (IS_ERR(psock)) {
+		ret = PTR_ERR(psock);
+		goto out_progs;
+	}
+
 	if (psock) {
-		if (!sk_has_psock(sk)) {
-			ret = -EBUSY;
-			goto out_progs;
-		}
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
 		    (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
 			sk_psock_put(sk, psock);

From 6fff607e2f14bd7c63c06c464a6f93b8efbabe28 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Fri, 19 Oct 2018 19:56:49 -0700
Subject: bpf: sk_msg program helper bpf_msg_push_data

This allows a user to push data into a msg using
sk_msg program types. The format is as follows:

  bpf_msg_push_data(msg, offset, len, flags)

This will insert 'len' bytes at offset 'offset'. For example, to
prepend 10 bytes at the front of the message the user can do:

  bpf_msg_push_data(msg, 0, 10, 0);

This will invalidate the data bounds, so the BPF user will have to
recheck data bounds after calling it. After this, the msg size will
have been updated and the user is free to write into the added bytes.

We allow any offset/len as long as it is within the (data, data_end)
range. However, a copy will be required if the ring is full, and it is
possible for the helper to fail with ENOMEM or EINVAL errors, which
need to be handled by the BPF program.

This can be used, similar to XDP metadata, to pass data between the
sk_msg layer and lower layers. (A usage sketch appears at the end of
this series.)

Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
---
 include/linux/skmsg.h    |   5 ++
 include/uapi/linux/bpf.h |  20 ++++++-
 net/core/filter.c        | 134 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 84e18863f6a4..2a11e9d91dfa 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -207,6 +207,11 @@ static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
 	return &msg->sg.data[which];
 }

+static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which)
+{
+	return msg->sg.data[which];
+}
+
 static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
 {
 	return sg_page(sk_msg_elem(msg, which));
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a2fb333290dc..852dc17ab47a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2240,6 +2240,23 @@ union bpf_attr {
  *		pointer that was returned from bpf_sk_lookup_xxx\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into *msg* at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg*, it may want to insert metadata or options into the
+ *		*msg*. This can later be read and used by any of the lower
+ *		layer BPF hooks.
+ *
+ *		This helper may fail if under memory pressure (a malloc
+ *		fails); in these cases BPF programs will get an appropriate
+ *		error and will need to handle it.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2331,7 +2348,8 @@ union bpf_attr {
 	FN(sk_release),			\
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
-	FN(map_peek_elem),
+	FN(map_peek_elem),		\
+	FN(msg_push_data),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 5fd5139e8638..35c6933c2622 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2297,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };

+BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
+	   u32, len, u64, flags)
+{
+	struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
+	u32 new, i = 0, l, space, copy = 0, offset = 0;
+	u8 *raw, *to, *from;
+	struct page *page;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg.start;
+	do {
+		l = sk_msg_elem(msg, i)->length;
+
+		if (start < offset + l)
+			break;
+		offset += l;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	if (start >= offset + l)
+		return -EINVAL;
+
+	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+	/* If no space is available we fall back to a copy; we need at
+	 * least one scatterlist elem available to push data into
+	 * when start aligns to the beginning of an element, or two
+	 * when it falls inside an element. We handle the start equals
+	 * offset case because it's the common case for inserting a
+	 * header.
+	 */
+	if (!space || (space == 1 && start != offset))
+		copy = msg->sg.data[i].length;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+			   get_order(copy + len));
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	if (copy) {
+		int front, back;
+
+		raw = page_address(page);
+
+		psge = sk_msg_elem(msg, i);
+		front = start - offset;
+		back = psge->length - front;
+		from = sg_virt(psge);
+
+		if (front)
+			memcpy(raw, from, front);
+
+		if (back) {
+			from += front;
+			to = raw + front + len;
+
+			memcpy(to, from, back);
+		}
+
+		put_page(sg_page(psge));
+	} else if (start - offset) {
+		psge = sk_msg_elem(msg, i);
+		rsge = sk_msg_elem_cpy(msg, i);
+
+		psge->length = start - offset;
+		rsge.length -= psge->length;
+		rsge.offset += start;
+
+		sk_msg_iter_var_next(i);
+		sg_unmark_end(psge);
+		sk_msg_iter_next(msg, end);
+	}
+
+	/* Slot(s) to place newly allocated data */
+	new = i;
+
+	/* Shift one or two slots as needed */
+	if (!copy) {
+		sge = sk_msg_elem_cpy(msg, i);
+
+		sk_msg_iter_var_next(i);
+		sg_unmark_end(&sge);
+		sk_msg_iter_next(msg, end);
+
+		nsge = sk_msg_elem_cpy(msg, i);
+		if (rsge.length) {
+			sk_msg_iter_var_next(i);
+			nnsge = sk_msg_elem_cpy(msg, i);
+		}
+
+		while (i != msg->sg.end) {
+			msg->sg.data[i] = sge;
+			sge = nsge;
+			sk_msg_iter_var_next(i);
+			if (rsge.length) {
+				nsge = nnsge;
+				nnsge = sk_msg_elem_cpy(msg, i);
+			} else {
+				nsge = sk_msg_elem_cpy(msg, i);
+			}
+		}
+	}
+
+	/* Place newly allocated data buffer */
+	sk_mem_charge(msg->sk, len);
+	msg->sg.size += len;
+	msg->sg.copy[new] = false;
+	sg_set_page(&msg->sg.data[new], page, len + copy, 0);
+	if (rsge.length) {
+		get_page(sg_page(&rsge));
+		sk_msg_iter_var_next(new);
+		msg->sg.data[new] = rsge;
+	}
+
+	sk_msg_compute_data_pointers(msg);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_push_data_proto = {
+	.func		= bpf_msg_push_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -4854,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_xdp_adjust_head ||
 	    func == bpf_xdp_adjust_meta ||
 	    func == bpf_msg_pull_data ||
+	    func == bpf_msg_push_data ||
 	    func == bpf_xdp_adjust_tail ||
 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	    func == bpf_lwt_seg6_store_bytes ||
@@ -5130,6 +5262,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_cork_bytes_proto;
 	case BPF_FUNC_msg_pull_data:
 		return &bpf_msg_pull_data_proto;
+	case BPF_FUNC_msg_push_data:
+		return &bpf_msg_push_data_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
 	default:

From c16ee04c9b305d57719344922c4d48379e206a79 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 21 Oct 2018 02:09:23 +0200
Subject: ulp: remove uid and user_visible members

They are not used anymore and therefore should be removed.

Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
---
 include/net/tcp.h  | 7 -------
 net/tls/tls_main.c | 2 --
 2 files changed, 9 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 14fdd7ce9992..8a61c3e8c15d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2051,11 +2051,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 #define TCP_ULP_MAX		128
 #define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)

-enum {
-	TCP_ULP_TLS,
-	TCP_ULP_BPF,
-};
-
 struct tcp_ulp_ops {
 	struct list_head	list;

@@ -2064,9 +2059,7 @@ struct tcp_ulp_ops {
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);

-	int uid;
 	char name[TCP_ULP_NAME_MAX];
-	bool user_visible;
 	struct module *owner;
 };
 int tcp_register_ulp(struct tcp_ulp_ops *type);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index e90b6d537077..311cec8e533d 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -715,8 +715,6 @@ EXPORT_SYMBOL(tls_unregister_device);

 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
-	.uid			= TCP_ULP_TLS,
-	.user_visible		= true,
 	.owner			= THIS_MODULE,
 	.init			= tls_init,
 };
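Returning to the bpf_msg_push_data() helper introduced above, the
promised usage sketch (editor's illustration; includes, names, and
setup are assumptions, not from any patch):

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  /* New helper, declared by id since bpf_helpers.h predates it here */
  static int (*bpf_msg_push_data)(void *msg, __u32 start, __u32 len,
  				  __u64 flags) =
  	(void *) BPF_FUNC_msg_push_data;

  SEC("sk_msg")
  int msg_prepend(struct sk_msg_md *msg)
  {
  	void *data, *data_end;

  	/* Insert 10 bytes at offset 0; may fail with -ENOMEM/-EINVAL */
  	if (bpf_msg_push_data(msg, 0, 10, 0))
  		return SK_DROP;

  	/* The push invalidated the data bounds; recheck before writing */
  	data = (void *)(long)msg->data;
  	data_end = (void *)(long)msg->data_end;
  	if (data + 10 > data_end)
  		return SK_PASS;

  	__builtin_memset(data, 0, 10);	/* fill the new header bytes */
  	return SK_PASS;
  }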