diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 1 | ||||
-rw-r--r-- | net/ipv4/bpf_tcp_ca.c | 43 | ||||
-rw-r--r-- | net/ipv4/tcp_bpf.c | 130 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 24 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/udp.c | 32 | ||||
-rw-r--r-- | net/ipv4/udp_bpf.c | 79 |
7 files changed, 190 insertions, 122 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1355e6c0d567..f17870ee558b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, + .read_sock = udp_read_sock, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index d520e61649c8..dff4f0eb96b0 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -5,6 +5,7 @@ #include <linux/bpf_verifier.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <net/tcp.h> #include <net/bpf_sk_storage.h> @@ -178,10 +179,52 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } +BTF_SET_START(bpf_tcp_ca_kfunc_ids) +BTF_ID(func, tcp_reno_ssthresh) +BTF_ID(func, tcp_reno_cong_avoid) +BTF_ID(func, tcp_reno_undo_cwnd) +BTF_ID(func, tcp_slow_start) +BTF_ID(func, tcp_cong_avoid_ai) +#ifdef CONFIG_DYNAMIC_FTRACE +#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC) +BTF_ID(func, cubictcp_init) +BTF_ID(func, cubictcp_recalc_ssthresh) +BTF_ID(func, cubictcp_cong_avoid) +BTF_ID(func, cubictcp_state) +BTF_ID(func, cubictcp_cwnd_event) +BTF_ID(func, cubictcp_acked) +#endif +#if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP) +BTF_ID(func, dctcp_init) +BTF_ID(func, dctcp_update_alpha) +BTF_ID(func, dctcp_cwnd_event) +BTF_ID(func, dctcp_ssthresh) +BTF_ID(func, dctcp_cwnd_undo) +BTF_ID(func, dctcp_state) +#endif +#if IS_BUILTIN(CONFIG_TCP_CONG_BBR) +BTF_ID(func, bbr_init) +BTF_ID(func, bbr_main) +BTF_ID(func, bbr_sndbuf_expand) +BTF_ID(func, bbr_undo_cwnd) +BTF_ID(func, bbr_cwnd_event) +BTF_ID(func, bbr_ssthresh) +BTF_ID(func, bbr_min_tso_segs) +BTF_ID(func, bbr_set_state) +#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +BTF_SET_END(bpf_tcp_ca_kfunc_ids) + +static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id) +{ + return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id); +} + static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { .get_func_proto = bpf_tcp_ca_get_func_proto, .is_valid_access = bpf_tcp_ca_is_valid_access, .btf_struct_access = bpf_tcp_ca_btf_struct_access, + .check_kfunc_call = bpf_tcp_ca_check_kfunc_call, }; static int bpf_tcp_ca_init_member(const struct btf_type *t, diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 17c322b875fd..3d622a0d0753 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,86 +10,6 @@ #include <net/inet_common.h> #include <net/tls.h> -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - struct sk_msg *msg_rx; - int i, copied = 0; - - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - if (!msg_rx->skb) - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - /* Lets not optimize peek case if copy_page_to_iter - * didn't copy the entire length lets just break. - */ - if (copy != sge->length) - return copied; - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) - break; - msg_rx = list_next_entry(msg_rx, list); - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); - } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -243,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -284,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -601,20 +499,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, bool restore) { + struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + + if (inet_csk_has_ulp(sk)) + return -EINVAL; + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ffcbe46dacdb..4a30deaa9a37 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -124,7 +124,7 @@ static inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -static void bictcp_init(struct sock *sk) +static void cubictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -137,7 +137,7 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); @@ -319,7 +319,7 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) +static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -338,7 +338,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_cong_avoid_ai(tp, ca->cnt, acked); } -static u32 bictcp_recalc_ssthresh(struct sock *sk) +static u32 cubictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -355,7 +355,7 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static void bictcp_state(struct sock *sk, u8 new_state) +static void cubictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -442,7 +442,7 @@ static void hystart_update(struct sock *sk, u32 delay) } } -static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) +static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -471,13 +471,13 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) } static struct tcp_congestion_ops cubictcp __read_mostly = { - .init = bictcp_init, - .ssthresh = bictcp_recalc_ssthresh, - .cong_avoid = bictcp_cong_avoid, - .set_state = bictcp_state, + .init = cubictcp_init, + .ssthresh = cubictcp_recalc_ssthresh, + .cong_avoid = cubictcp_cong_avoid, + .set_state = cubictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, - .cwnd_event = bictcp_cwnd_event, - .pkts_acked = bictcp_acked, + .cwnd_event = cubictcp_cwnd_event, + .pkts_acked = cubictcp_acked, .owner = THIS_MODULE, .name = "cubic", }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index daad4f99db32..dfc6d1c0e710 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2806,6 +2806,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0695ce42dc5..bfcc7f1a8a7f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1782,6 +1782,35 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct sk_buff *skb; + int err, used; + + skb = skb_recv_udp(sk, 0, 1, &err); + if (!skb) + return err; + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + break; + } else if (used <= skb->len) { + copied += used; + } + + if (!desc->count) + break; + } + + return copied; +} +EXPORT_SYMBOL(udp_read_sock); + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -2854,6 +2883,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..7d5c4ebf42fe 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -4,6 +4,68 @@ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> +#include <net/inet_common.h> + +#include "udp_impl.h" + +static struct proto *udpv6_prot_saved __read_mostly; + +static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags, + addr_len); +#endif + return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); +} + +static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + if (sk_psock_queue_empty(psock)) { + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + if (err) { + ret = err; + goto out; + } + copied = -EAGAIN; + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} enum { UDP_BPF_IPV4, @@ -11,7 +73,6 @@ enum { UDP_BPF_NUM_PROTS, }; -static struct proto *udpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; @@ -20,6 +81,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) *prot = *base; prot->unhash = sock_map_unhash; prot->close = sock_map_close; + prot->recvmsg = udp_bpf_recvmsg; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -41,12 +103,23 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + struct sk_psock *psock = sk_psock(sk); + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } +EXPORT_SYMBOL_GPL(udp_bpf_update_proto); |