From 13c7ba0c8494a4fcee9e8cc163ae332d0480bde5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Nov 2020 11:05:03 -0800 Subject: mptcp: adjust mptcp receive buffer limit if subflow has larger one In addition to tcp autotuning during read, it may also increase the receive buffer in tcp_clamp_window(). In this case, mptcp should adjust its receive buffer size as well so it can move all pending skbs from the subflow socket to the mptcp socket. At this time, TCP can have more skbs ready for processing than what the mptcp receive buffer size allows. In the mptcp case, the receive window announced is based on the free space of the mptcp parent socket instead of the individual subflows. Following the subflow allows mptcp to grow its receive buffer. This is especially noticeable for loopback traffic where two skbs are enough to fill the initial receive window. In mptcp_data_ready() we do not hold the mptcp socket lock, so modifying mptcp_sk->sk_rcvbuf is racy. Do it when moving skbs from subflow to mptcp socket, both sockets are locked in this case. Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e7419fd15d84..e010ef7585bf 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -466,6 +466,18 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct tcp_sock *tp; u32 old_copied_seq; bool done = false; + int sk_rbuf; + + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + + if (unlikely(ssk_rbuf > sk_rbuf)) { + WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); + sk_rbuf = ssk_rbuf; + } + } pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); @@ -528,7 +540,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) { + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } @@ -622,6 +634,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); + int sk_rbuf, ssk_rbuf; bool wake; /* move_skbs_to_msk below can legitly clear the data_avail flag, @@ -632,12 +645,16 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (wake) set_bit(MPTCP_DATA_READY, &msk->flags); - if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) && - move_skbs_to_msk(msk, ssk)) + ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + if (unlikely(ssk_rbuf > sk_rbuf)) + sk_rbuf = ssk_rbuf; + + /* over limit? can't append more skbs to msk */ + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) goto wake; - /* don't schedule if mptcp sk is (still) over limit */ - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) + if (move_skbs_to_msk(msk, ssk)) goto wake; /* mptcp socket is owned, release_cb should retry */ -- cgit v1.2.3 From 65f49fe72f9e4063b96c78de62f3ff9a3384164e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Nov 2020 11:05:04 -0800 Subject: mptcp: use _fast lock version in __mptcp_move_skbs The function is short and won't sleep, so this can use the _fast version. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e010ef7585bf..f5bacfc55006 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1493,13 +1493,14 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); + bool slowpath; if (!ssk) break; - lock_sock(ssk); + slowpath = lock_sock_fast(ssk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - release_sock(ssk); + unlock_sock_fast(ssk, slowpath); } while (!done); if (mptcp_ofo_queue(msk) || moved > 0) { -- cgit v1.2.3 From 5a369ca64364b49caff424d2f988901bc7658b6d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 3 Nov 2020 11:05:05 -0800 Subject: tcp: propagate MPTCP skb extensions on xmit splits When the TCP stack splits a packet on the write queue, the tail half currently lose the associated skb extensions, and will not carry the DSM on the wire. The above does not cause functional problems and is allowed by the RFC, but interact badly with GRO and RX coalescing, as possible candidates for aggregation will carry different TCP options. This change tries to improve the MPTCP behavior, propagating the skb extensions on split. Additionally, we must prevent the MPTCP stack from updating the mapping after the split occur: that will both violate the RFC and fool the reader. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 21 ++++++++++++++++++++- net/ipv4/tcp_output.c | 3 +++ net/mptcp/protocol.c | 7 +++++-- 3 files changed, 28 insertions(+), 3 deletions(-) (limited to 'net/mptcp') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 753ba7e755d6..6e706d838e4e 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -29,7 +29,8 @@ struct mptcp_ext { use_ack:1, ack64:1, mpc_map:1, - __unused:2; + frozen:1, + __unused:1; /* one byte hole */ }; @@ -106,6 +107,19 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to, from->active_extensions = 0; } +static inline void mptcp_skb_ext_copy(struct sk_buff *to, + struct sk_buff *from) +{ + struct mptcp_ext *from_ext; + + from_ext = skb_ext_find(from, SKB_EXT_MPTCP); + if (!from_ext) + return; + + from_ext->frozen = 1; + skb_ext_copy(to, from); +} + static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { @@ -193,6 +207,11 @@ static inline void mptcp_skb_ext_move(struct sk_buff *to, { } +static inline void mptcp_skb_ext_copy(struct sk_buff *to, + struct sk_buff *from) +{ +} + static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bf48cd73e967..9abf5a0358d5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1569,6 +1569,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -2123,6 +2124,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); + mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); @@ -2393,6 +2395,7 @@ static int tcp_mtu_probe(struct sock *sk) skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); + mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f5bacfc55006..25d12183d1ca 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -771,8 +771,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq, if (!tcp_skb_can_collapse_to(skb)) return false; - /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == write_seq; + /* can collapse only if MPTCP level sequence is in order and this + * mapping has not been xmitted yet + */ + return mpext && mpext->data_seq + mpext->data_len == write_seq && + !mpext->frozen; } static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, -- cgit v1.2.3 From 95ed690ebc72ad6c89068f08197b51fe4d3c3b48 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Nov 2020 11:05:06 -0800 Subject: mptcp: split mptcp_clean_una function mptcp_clean_una() will wake writers in case memory could be reclaimed. When called from mptcp_sendmsg the wakeup code isn't needed. Move the wakeup to a new helper and then use that from the mptcp worker. Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 25d12183d1ca..b84b84adc9ad 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -853,19 +853,25 @@ static void mptcp_clean_una(struct sock *sk) } out: - if (cleaned) { + if (cleaned) sk_mem_reclaim_partial(sk); +} - /* Only wake up writers if a subflow is ready */ - if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags); - smp_mb__after_atomic(); +static void mptcp_clean_una_wakeup(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ - sk_stream_write_space(sk); - } + mptcp_clean_una(sk); + + /* Only wake up writers if a subflow is ready */ + if (mptcp_is_writeable(msk)) { + set_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + + /* set SEND_SPACE before sk_stream_write_space clears + * NOSPACE + */ + sk_stream_write_space(sk); } } @@ -1769,7 +1775,7 @@ static void mptcp_worker(struct work_struct *work) long timeo = 0; lock_sock(sk); - mptcp_clean_una(sk); + mptcp_clean_una_wakeup(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) -- cgit v1.2.3 From 93f323b9cccc1fc77660de49faefd11fdfd55017 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 3 Nov 2020 11:05:07 -0800 Subject: mptcp: add a new sysctl add_addr_timeout This patch added a new sysctl, named add_addr_timeout, to control the timeout value (in seconds) of the ADD_ADDR retransmission. Suggested-by: Matthieu Baerts Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 14 ++++++++++++++ net/mptcp/pm_netlink.c | 8 ++++++-- net/mptcp/protocol.h | 1 + 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 54b888f94009..96ba616f59bf 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -18,6 +18,7 @@ struct mptcp_pernet { struct ctl_table_header *ctl_table_hdr; int mptcp_enabled; + unsigned int add_addr_timeout; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net) return mptcp_get_pernet(net)->mptcp_enabled; } +unsigned int mptcp_get_add_addr_timeout(struct net *net) +{ + return mptcp_get_pernet(net)->add_addr_timeout; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = { */ .proc_handler = proc_dointvec, }, + { + .procname = "add_addr_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, {} }; static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; } static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) } table[0].data = &pernet->mptcp_enabled; + table[1].data = &pernet->add_addr_timeout; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 0d6f3d912891..ed60538df7b2 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -206,6 +206,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); pr_debug("msk=%p", msk); @@ -232,7 +233,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer) } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(net)); spin_unlock_bh(&msk->pm.lock); @@ -264,6 +266,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); if (lookup_anno_list_by_saddr(msk, &entry->addr)) return false; @@ -279,7 +282,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); return true; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 13ab89dc1914..278c88c405e8 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -362,6 +362,7 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) } int mptcp_is_enabled(struct net *net); +unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); -- cgit v1.2.3 From 724d06b437bba9faf280842a983e69c04f244767 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 10 Nov 2020 11:01:43 +0800 Subject: mptcp: fix static checker warnings in mptcp_pm_add_timer Fix the following Smatch complaint: net/mptcp/pm_netlink.c:213 mptcp_pm_add_timer() warn: variable dereferenced before check 'msk' (see line 208) net/mptcp/pm_netlink.c 207 struct mptcp_sock *msk = entry->sock; 208 struct sock *sk = (struct sock *)msk; 209 struct net *net = sock_net(sk); ^^ "msk" dereferenced here. 210 211 pr_debug("msk=%p", msk); 212 213 if (!msk) ^^^^ Too late. 214 return; 215 Fixes: 93f323b9cccc ("mptcp: add a new sysctl add_addr_timeout") Reported-by: Dan Carpenter Reviewed-by: Dan Carpenter Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/078a2ef5bdc4e3b2c25ef852461692001f426495.1604976945.git.geliangtang@gmail.com Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ed60538df7b2..446ef8f07734 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -206,7 +206,6 @@ static void mptcp_pm_add_timer(struct timer_list *timer) struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; - struct net *net = sock_net(sk); pr_debug("msk=%p", msk); @@ -234,7 +233,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(net)); + jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); spin_unlock_bh(&msk->pm.lock); -- cgit v1.2.3 From e2223995a2872c0e23ed44e1dbb493817b567666 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:03 +0100 Subject: mptcp: use tcp_build_frag() mptcp_push_pending() is called even on orphaned msk (and orphaned subflows), if there is outstanding data at close() time. To cope with the above MPTCP needs to handle explicitly the allocation failure on xmit. The newly introduced do_tcp_sendfrag() allows that, just plug it. We can additionally drop a couple of sanity checks, duplicate in the TCP code. Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index eaa61e227492..3c68cf912fb8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -992,17 +992,13 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, psize = min_t(size_t, dfrag->data_len, avail_size); } - /* tell the TCP stack to delay the push so that we can safely - * access the skb after the sendpages call - */ - ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); - if (ret <= 0) { - if (!retransmission) - iov_iter_revert(&msg->msg_iter, psize); - return ret; + tail = tcp_build_frag(ssk, psize, msg->msg_flags, page, offset, &psize); + if (!tail) { + tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + return -ENOMEM; } + ret = psize; frag_truesize += ret; if (!retransmission) { if (unlikely(ret < psize)) @@ -1026,20 +1022,15 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, sk->sk_forward_alloc -= frag_truesize; } - /* if the tail skb extension is still the cached one, collapsing - * really happened. Note: we can't check for 'same skb' as the sk_buff - * hdr on tail can be transmitted, freed and re-allocated by the - * do_tcp_sendpages() call + /* if the tail skb is still the cached one, collapsing really happened. */ - tail = tcp_write_queue_tail(ssk); - if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { + if (skb == tail) { WARN_ON_ONCE(!can_collapse); mpext->data_len += ret; goto out; } - skb = tcp_write_queue_tail(ssk); - mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); + mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext); msk->cached_ext = NULL; memset(mpext, 0, sizeof(*mpext)); -- cgit v1.2.3 From ba8f48f7a4d79352b764ace585b5f602ef940be0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:05 +0100 Subject: mptcp: introduce mptcp_schedule_work remove some of code duplications an allow preventing rescheduling on close. Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 +-- net/mptcp/protocol.c | 36 ++++++++++++++++++++++-------------- net/mptcp/protocol.h | 1 + 3 files changed, 24 insertions(+), 16 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e19e1525ecbb..f9c88e2abb8e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -89,8 +89,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + mptcp_schedule_work((struct sock *)msk); return true; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3c68cf912fb8..0e725f18af24 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -620,9 +620,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) * this is not a good place to change state. Let the workqueue * do it. */ - if (mptcp_pending_data_fin(sk, NULL) && - schedule_work(&msk->work)) - sock_hold(sk); + if (mptcp_pending_data_fin(sk, NULL)) + mptcp_schedule_work(sk); } spin_unlock_bh(&sk->sk_lock.slock); @@ -699,23 +698,32 @@ static void mptcp_reset_timer(struct sock *sk) sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } +bool mptcp_schedule_work(struct sock *sk) +{ + if (inet_sk_state_load(sk) != TCP_CLOSE && + schedule_work(&mptcp_sk(sk)->work)) { + /* each subflow already holds a reference to the sk, and the + * workqueue is invoked by a subflow, so sk can't go away here. + */ + sock_hold(sk); + return true; + } + return false; +} + void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && - schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + (inet_sk_state_load(sk) != TCP_ESTABLISHED))) + mptcp_schedule_work(sk); } void mptcp_subflow_eof(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - - if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && - schedule_work(&msk->work)) - sock_hold(sk); + if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); } static void mptcp_check_for_eof(struct mptcp_sock *msk) @@ -1620,8 +1628,7 @@ static void mptcp_retransmit_handler(struct sock *sk) mptcp_stop_timer(sk); } else { set_bit(MPTCP_WORK_RTX, &msk->flags); - if (schedule_work(&msk->work)) - sock_hold(sk); + mptcp_schedule_work(sk); } } @@ -2334,7 +2341,8 @@ static void mptcp_release_cb(struct sock *sk) struct sock *ssk; ssk = mptcp_subflow_recv_lookup(msk); - if (!ssk || !schedule_work(&msk->work)) + if (!ssk || sk->sk_state == TCP_CLOSE || + !schedule_work(&msk->work)) __sock_put(sk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 278c88c405e8..5211564a533f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -408,6 +408,7 @@ static inline bool mptcp_is_fully_established(struct sock *sk) void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); +bool mptcp_schedule_work(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); -- cgit v1.2.3 From caf971df01b86f33f151bcfa61b4385cf5e43822 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:06 +0100 Subject: mptcp: reduce the arguments of mptcp_sendmsg_frag The current argument list is pretty long and quite unreadable, move many of them into a specific struct. Later patches will add more stuff to such struct. Additionally drop the 'timeo' argument, now unused. Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 53 ++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0e725f18af24..4ed9b1cc3b1e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -914,12 +914,16 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, return dfrag; } +struct mptcp_sendmsg_info { + int mss_now; + int size_goal; +}; + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct msghdr *msg, struct mptcp_data_frag *dfrag, - long *timeo, int *pmss_now, - int *ps_goal) + struct mptcp_sendmsg_info *info) { - int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; + int avail_size, offset, ret, frag_truesize = 0; bool dfrag_collapsed, can_collapse = false; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_ext *mpext = NULL; @@ -945,10 +949,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, } /* compute copy limit */ - mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); - *pmss_now = mss_now; - *ps_goal = size_goal; - avail_size = size_goal; + info->mss_now = tcp_send_mss(ssk, &info->size_goal, msg->msg_flags); + avail_size = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { mpext = skb_ext_find(skb, SKB_EXT_MPTCP); @@ -959,12 +961,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * queue management operation, to avoid breaking the ext <-> * SSN association set here */ - can_collapse = (size_goal - skb->len > 0) && + can_collapse = (info->size_goal - skb->len > 0) && mptcp_skb_can_collapse_to(*write_seq, skb, mpext); if (!can_collapse) TCP_SKB_CB(skb)->eor = 1; else - avail_size = size_goal - skb->len; + avail_size = info->size_goal - skb->len; } if (!retransmission) { @@ -1187,11 +1189,15 @@ static void ssk_check_wmem(struct mptcp_sock *msk) static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { - int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .mss_now = 0, + .size_goal = 0, + }; struct page_frag *pfrag; size_t copied = 0; struct sock *ssk; + int ret = 0; u32 sndbuf; bool tx_ok; long timeo; @@ -1260,8 +1266,7 @@ restart: lock_sock(ssk); tx_ok = msg_data_left(msg); while (tx_ok) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, - &size_goal); + ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &info); if (ret < 0) { if (ret == -EAGAIN && timeo > 0) { mptcp_set_timeout(sk, ssk); @@ -1284,8 +1289,8 @@ restart: if (!sk_stream_memory_free(ssk) || !mptcp_page_frag_refill(ssk, pfrag) || !mptcp_ext_cache_refill(msk)) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); + tcp_push(ssk, msg->msg_flags, info.mss_now, + tcp_sk(ssk)->nonagle, info.size_goal); mptcp_set_timeout(sk, ssk); release_sock(ssk); goto restart; @@ -1305,8 +1310,8 @@ restart: * limits before we send more data. */ if (unlikely(!sk_stream_memory_free(sk))) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); + tcp_push(ssk, msg->msg_flags, info.mss_now, + tcp_sk(ssk)->nonagle, info.size_goal); mptcp_clean_una(sk); if (!sk_stream_memory_free(sk)) { /* can't send more for now, need to wait for @@ -1323,8 +1328,8 @@ restart: mptcp_set_timeout(sk, ssk); if (copied) { - tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); + tcp_push(ssk, msg->msg_flags, info.mss_now, + tcp_sk(ssk)->nonagle, info.size_goal); /* start the timer, if it's not pending */ if (!mptcp_timer_pending(sk)) @@ -1763,14 +1768,15 @@ static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, mss_now = 0, size_goal = 0; + struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; + int orig_len, orig_offset; u64 orig_write_seq; size_t copied = 0; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - long timeo = 0; + int ret; lock_sock(sk); mptcp_clean_una_wakeup(sk); @@ -1809,8 +1815,7 @@ static void mptcp_worker(struct work_struct *work) orig_offset = dfrag->offset; orig_write_seq = dfrag->data_seq; while (dfrag->data_len > 0) { - int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, - &mss_now, &size_goal); + ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &info); if (ret < 0) break; @@ -1823,8 +1828,8 @@ static void mptcp_worker(struct work_struct *work) break; } if (copied) - tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); dfrag->data_seq = orig_write_seq; dfrag->offset = orig_offset; -- cgit v1.2.3 From f0e6a4cf11f16425ccdc69f4410e4fe59719a9ea Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:07 +0100 Subject: mptcp: add accounting for pending data Preparation patch to track the data pending in the msk write queue. No functional change introduced here Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + net/mptcp/protocol.h | 38 +++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4ed9b1cc3b1e..40c4227ae9fe 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1859,6 +1859,7 @@ static int __mptcp_init_sock(struct sock *sk) __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; + msk->first_pending = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5211564a533f..ec17f9c367c5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -187,9 +187,10 @@ struct mptcp_pm_data { struct mptcp_data_frag { struct list_head list; u64 data_seq; - int data_len; - int offset; - int overhead; + u16 data_len; + u16 offset; + u16 overhead; + u16 already_sent; struct page *page; }; @@ -219,6 +220,7 @@ struct mptcp_sock { struct rb_root out_of_order_queue; struct list_head conn_list; struct list_head rtx_queue; + struct mptcp_data_frag *first_pending; struct list_head join_list; struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ @@ -240,6 +242,36 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + + return READ_ONCE(msk->first_pending); +} + +static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *cur; + + cur = msk->first_pending; + return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : + list_next_entry(cur, list); +} + +static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->first_pending) + return NULL; + + if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) + return NULL; + + return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); +} + static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); -- cgit v1.2.3 From eaa2ffabfc35580da3fb1d31897fb696c514ea7a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:08 +0100 Subject: mptcp: introduce MPTCP snd_nxt Track the next MPTCP sequence number used on xmit, currently always equal to write_next. Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 4 ++-- net/mptcp/protocol.c | 7 +++++-- net/mptcp/protocol.h | 17 +++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a044dd43411d..a6b57021b6d0 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -813,7 +813,7 @@ static void update_una(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); - u64 write_seq = READ_ONCE(msk->write_seq); + u64 snd_nxt = READ_ONCE(msk->snd_nxt); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way @@ -822,7 +822,7 @@ static void update_una(struct mptcp_sock *msk, new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ - if (after64(new_snd_una, write_seq)) + if (after64(new_snd_una, snd_nxt)) new_snd_una = old_snd_una; while (after64(new_snd_una, old_snd_una)) { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 40c4227ae9fe..2b0fd08b03f9 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -834,7 +834,7 @@ static void mptcp_clean_una(struct sock *sk) * plain TCP */ if (__mptcp_check_fallback(msk)) - atomic64_set(&msk->snd_una, msk->write_seq); + atomic64_set(&msk->snd_una, msk->snd_nxt); snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { @@ -1338,6 +1338,7 @@ restart: release_sock(ssk); out: + msk->snd_nxt = msk->write_seq; ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; @@ -1629,7 +1630,7 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) { + if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->snd_nxt)) { mptcp_stop_timer(sk); } else { set_bit(MPTCP_WORK_RTX, &msk->flags); @@ -2100,6 +2101,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, WRITE_ONCE(msk->fully_established, false); msk->write_seq = subflow_req->idsn + 1; + msk->snd_nxt = msk->write_seq; atomic64_set(&msk->snd_una, msk->write_seq); if (mp_opt->mp_capable) { msk->can_ack = true; @@ -2409,6 +2411,7 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); atomic64_set(&msk->snd_una, msk->write_seq); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ec17f9c367c5..946319cf9cca 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -92,6 +92,13 @@ #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) + struct mptcp_options_received { u64 sndr_key; u64 rcvr_key; @@ -201,6 +208,7 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 snd_nxt; u64 ack_seq; u64 rcv_data_fin_seq; struct sock *last_snd; @@ -276,7 +284,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (list_empty(&msk->rtx_queue)) + if (!before64(msk->snd_nxt, atomic64_read(&msk->snd_una))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); @@ -528,13 +536,6 @@ static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } -static inline bool before64(__u64 seq1, __u64 seq2) -{ - return (__s64)(seq1 - seq2) < 0; -} - -#define after64(seq2, seq1) before64(seq1, seq2) - void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) -- cgit v1.2.3 From e16163b6e2b720fb74e5af758546f6dad27e6c9e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:09 +0100 Subject: mptcp: refactor shutdown and close We must not close the subflows before all the MPTCP level data, comprising the DATA_FIN has been acked at the MPTCP level, otherwise we could be unable to retransmit as needed. __mptcp_wr_shutdown() shutdown is responsible to check for the correct status and close all subflows. Is called by the output path after spooling any data and at shutdown/close time. In a similar way, __mptcp_destroy_sock() is responsible to clean-up the MPTCP level status, and is called when the msk transition to TCP_CLOSE. The protocol level close() does not force anymore the TCP_CLOSE status, but orphan the msk socket and all the subflows. Orphaned msk sockets are forciby closed after a timeout or when all MPTCP-level data is acked. There is a caveat about keeping the orphaned subflows around: the TCP stack can asynchronusly call tcp_cleanup_ulp() on them via tcp_close(). To prevent accessing freed memory on later MPTCP level operations, the msk acquires a reference to each subflow socket and prevent subflow_ulp_release() from releasing the subflow context before __mptcp_destroy_sock(). The additional subflow references are released by __mptcp_done() and the async ULP release is detected checking ULP ops. If such field has been already cleared by the ULP release path, the dangling context is freed directly by __mptcp_done(). Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 2 +- net/mptcp/pm_netlink.c | 6 +- net/mptcp/protocol.c | 340 +++++++++++++++++++++++++++++++++++-------------- net/mptcp/protocol.h | 13 +- net/mptcp/subflow.c | 22 +++- 5 files changed, 273 insertions(+), 110 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a6b57021b6d0..1be272d2bd95 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -492,7 +492,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; mpext = skb ? mptcp_get_ext(skb) : NULL; - snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); + snd_data_fin_enable = mptcp_data_fin_enabled(msk); if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 446ef8f07734..f8a9d82a0ea8 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -416,14 +416,13 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (msk->pm.rm_id != subflow->remote_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.add_addr_accepted--; @@ -452,14 +451,13 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (rm_id != subflow->local_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.local_addr_used--; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2b0fd08b03f9..daa51e657db8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #include #endif #include +#include #include "protocol.h" #include "mib.h" @@ -41,6 +42,8 @@ struct mptcp_skb_cb { static struct percpu_counter mptcp_sockets_allocated; +static void __mptcp_destroy_sock(struct sock *sk); + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. @@ -102,6 +105,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); + sock_hold(ssock->sk); subflow->request_mptcp = 1; /* accept() will wait on first subflow sk_wq, and we always wakes up @@ -323,6 +327,19 @@ static void mptcp_stop_timer(struct sock *sk) mptcp_sk(sk)->timer_ival = 0; } +static void mptcp_close_wake_up(struct sock *sk) +{ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); +} + static void mptcp_check_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -341,20 +358,14 @@ static void mptcp_check_data_fin_ack(struct sock *sk) switch (sk->sk_state) { case TCP_FIN_WAIT1: inet_sk_state_store(sk, TCP_FIN_WAIT2); - sk->sk_state_change(sk); break; case TCP_CLOSING: case TCP_LAST_ACK: inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_state_change(sk); break; } - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_close_wake_up(sk); } } @@ -428,7 +439,6 @@ static void mptcp_check_data_fin(struct sock *sk) break; case TCP_FIN_WAIT2: inet_sk_state_store(sk, TCP_CLOSE); - // @@ Close subflows now? break; default: /* Other states not expected */ @@ -445,13 +455,7 @@ static void mptcp_check_data_fin(struct sock *sk) release_sock(ssk); } - sk->sk_state_change(sk); - - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_close_wake_up(sk); } } @@ -691,6 +695,10 @@ static void mptcp_reset_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; + /* prevent rescheduling on close */ + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + return; + /* should never be called with mptcp level timer cleared */ tout = READ_ONCE(mptcp_sk(sk)->timer_ival); if (WARN_ON_ONCE(!tout)) @@ -734,8 +742,10 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) receivers += !subflow->rx_eof; + if (receivers) + return; - if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { /* hopefully temporary hack: propagate shutdown status * to msk, when all subflows agree on it */ @@ -745,6 +755,19 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + /* fallback sockets skip TCP_CLOSING - TCP will take care */ + inet_sk_state_store(sk, TCP_CLOSE); + break; + default: + return; + } + mptcp_close_wake_up(sk); } static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) @@ -1657,6 +1680,13 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } +static void mptcp_timeout_timer(struct timer_list *t) +{ + struct sock *sk = from_timer(sk, t, sk_timer); + + mptcp_schedule_work(sk); +} + /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * @@ -1703,20 +1733,43 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) * parent socket. */ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout) + struct mptcp_subflow_context *subflow) { - struct socket *sock = READ_ONCE(ssk->sk_socket); + bool dispose_socket = false; + struct socket *sock; list_del(&subflow->node); - if (sock && sock != sk->sk_socket) { - /* outgoing subflow */ - sock_release(sock); + lock_sock(ssk); + + /* if we are invoked by the msk cleanup code, the subflow is + * already orphaned + */ + sock = ssk->sk_socket; + if (sock) { + dispose_socket = sock != sk->sk_socket; + sock_orphan(ssk); + } + + /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops + * the ssk has been already destroyed, we just need to release the + * reference owned by msk; + */ + if (!inet_csk(ssk)->icsk_ulp_ops) { + kfree_rcu(subflow, rcu); } else { - /* incoming subflow */ - tcp_close(ssk, timeout); + /* otherwise ask tcp do dispose of ssk and subflow ctx */ + subflow->disposable = 1; + __tcp_close(ssk, 0); + + /* close acquired an extra ref */ + __sock_put(ssk); } + release_sock(ssk); + if (dispose_socket) + iput(SOCK_INODE(sock)); + + sock_put(ssk); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) @@ -1761,10 +1814,29 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) if (inet_sk_state_load(ssk) != TCP_CLOSE) continue; - __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + __mptcp_close_ssk((struct sock *)msk, ssk, subflow); } } +static bool mptcp_check_close_timeout(const struct sock *sk) +{ + s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; + struct mptcp_subflow_context *subflow; + + if (delta >= TCP_TIMEWAIT_LEN) + return true; + + /* if all subflows are in closed status don't bother with additional + * timeout + */ + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) != + TCP_CLOSE) + return false; + } + return true; +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1777,9 +1849,14 @@ static void mptcp_worker(struct work_struct *work) struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - int ret; + int state, ret; lock_sock(sk); + set_bit(MPTCP_WORKER_RUNNING, &msk->flags); + state = sk->sk_state; + if (unlikely(state == TCP_CLOSE)) + goto unlock; + mptcp_clean_una_wakeup(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); @@ -1796,6 +1873,18 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_data_fin(sk); + /* if the msk data is completely acked, or the socket timedout, + * there is no point in keeping around an orphaned sk + */ + if (sock_flag(sk, SOCK_DEAD) && + (mptcp_check_close_timeout(sk) || + (state != sk->sk_state && + ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) { + inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_destroy_sock(sk); + goto unlock; + } + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; @@ -1844,6 +1933,7 @@ reset_unlock: mptcp_reset_timer(sk); unlock: + clear_bit(MPTCP_WORKER_RUNNING, &msk->flags); release_sock(sk); sock_put(sk); } @@ -1869,7 +1959,7 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); - + timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); return 0; } @@ -1914,8 +2004,12 @@ static void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (cancel_work_sync(&msk->work)) - sock_put(sk); + /* if called by the work itself, do not try to cancel the work, or + * we will hang. + */ + if (!test_bit(MPTCP_WORKER_RUNNING, &msk->flags) && + cancel_work_sync(&msk->work)) + __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) @@ -1973,42 +2067,61 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void mptcp_close(struct sock *sk, long timeout) +static void __mptcp_check_send_data_fin(struct sock *sk) { - struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - LIST_HEAD(conn_list); - lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", + msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), + msk->snd_nxt, msk->write_seq); - if (sk->sk_state == TCP_LISTEN) { + /* we still need to enqueue subflows or not really shutting down, + * skip this + */ + if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || + mptcp_send_head(sk)) + return; + + WRITE_ONCE(msk->snd_nxt, msk->write_seq); + + /* fallback socket will not get data_fin/ack, can move to close now */ + if (__mptcp_check_fallback(msk) && sk->sk_state == TCP_LAST_ACK) { inet_sk_state_store(sk, TCP_CLOSE); - goto cleanup; - } else if (sk->sk_state == TCP_CLOSE) { - goto cleanup; + mptcp_close_wake_up(sk); } - if (__mptcp_check_fallback(msk)) { - goto update_state; - } else if (mptcp_close_state(sk)) { - pr_debug("Sending DATA_FIN sk=%p", sk); - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + __mptcp_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK); - } + mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } +} - sk_stream_wait_close(sk, timeout); +static void __mptcp_wr_shutdown(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); -update_state: - inet_sk_state_store(sk, TCP_CLOSE); + pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", + msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, + !!mptcp_send_head(sk)); + + /* will be ignored by fallback sockets */ + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + __mptcp_check_send_data_fin(sk); +} + +static void __mptcp_destroy_sock(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + LIST_HEAD(conn_list); + + pr_debug("msk=%p", msk); -cleanup: /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@ -2018,19 +2131,74 @@ cleanup: list_splice_init(&msk->conn_list, &conn_list); __mptcp_clear_xmit(sk); - - release_sock(sk); + sk_stop_timer(sk, &sk->sk_timer); + msk->pm.status = 0; list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); } - mptcp_cancel_work(sk); + sk->sk_prot->destroy(sk); - __skb_queue_purge(&sk->sk_receive_queue); + sk_stream_kill_queues(sk); + xfrm_sk_free_policy(sk); + sk_refcnt_debug_release(sk); + sock_put(sk); +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_subflow_context *subflow; + bool do_cancel_work = false; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } - sk_common_release(sk); + if (mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); + + sk_stream_wait_close(sk, timeout); + +cleanup: + /* orphan all the subflows */ + inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; + list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow, dispose_socket; + struct socket *sock; + + slow = lock_sock_fast(ssk); + sock = ssk->sk_socket; + dispose_socket = sock && sock != sk->sk_socket; + sock_orphan(ssk); + unlock_sock_fast(ssk, slow); + + /* for the outgoing subflows we additionally need to free + * the associated socket + */ + if (dispose_socket) + iput(SOCK_INODE(sock)); + } + sock_orphan(sk); + + sock_hold(sk); + pr_debug("msk=%p state=%d", sk, sk->sk_state); + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + do_cancel_work = true; + } else { + sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + } + release_sock(sk); + if (do_cancel_work) + mptcp_cancel_work(sk); + sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) @@ -2183,6 +2351,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); + sock_hold(ssk); mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(new_mptcp_sock); @@ -2430,9 +2599,9 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } -bool mptcp_finish_join(struct sock *sk) +bool mptcp_finish_join(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; struct socket *parent_sock; @@ -2453,12 +2622,14 @@ bool mptcp_finish_join(struct sock *sk) /* active connections are already on conn_list, and we can't acquire * msk lock here. * use the join list lock as synchronization point and double-check - * msk status to avoid racing with mptcp_close() + * msk status to avoid racing with __mptcp_destroy_sock() */ spin_lock_bh(&msk->join_list_lock); ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; - if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) + if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { list_add_tail(&subflow->node, &msk->join_list); + sock_hold(ssk); + } spin_unlock_bh(&msk->join_list_lock); if (!ret) return false; @@ -2467,8 +2638,8 @@ bool mptcp_finish_join(struct sock *sk) * at close time */ parent_sock = READ_ONCE(parent->sk_socket); - if (parent_sock && !sk->sk_socket) - mptcp_sock_graft(sk, parent_sock); + if (parent_sock && !ssk->sk_socket) + mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); return true; } @@ -2704,12 +2875,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct mptcp_subflow_context *subflow; + struct sock *sk = sock->sk; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); - lock_sock(sock->sk); + lock_sock(sk); how++; if ((how & ~SHUTDOWN_MASK) || !how) { @@ -2718,45 +2889,22 @@ static int mptcp_shutdown(struct socket *sock, int how) } if (sock->state == SS_CONNECTING) { - if ((1 << sock->sk->sk_state) & + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) sock->state = SS_DISCONNECTING; else sock->state = SS_CONNECTED; } - /* If we've already sent a FIN, or it's a closed state, skip this. */ - if (__mptcp_check_fallback(msk)) { - if (how == SHUT_WR || how == SHUT_RDWR) - inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } - } else if ((how & SEND_SHUTDOWN) && - ((1 << sock->sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_SYN_SENT | - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) && - mptcp_close_state(sock->sk)) { - __mptcp_flush_join_list(msk); - - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } - } + sk->sk_shutdown |= how; + if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); /* Wake up anyone sleeping in poll. */ - sock->sk->sk_state_change(sock->sk); + sk->sk_state_change(sk); out_unlock: - release_sock(sock->sk); + release_sock(sk); return ret; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 946319cf9cca..fd9c666aed7f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -91,6 +91,7 @@ #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 +#define MPTCP_WORKER_RUNNING 6 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -352,7 +353,8 @@ struct mptcp_subflow_context { mpc_map : 1, backup : 1, rx_eof : 1, - can_ack : 1; /* only after processing the remote a key */ + can_ack : 1, /* only after processing the remote a key */ + disposable : 1; /* ctx can be free at ulp release time */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -409,8 +411,7 @@ bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout); + struct mptcp_subflow_context *subflow); void mptcp_subflow_reset(struct sock *ssk); /* called with sk socket lock held */ @@ -452,6 +453,12 @@ bool mptcp_schedule_work(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); +static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->snd_data_fin_enable) && + READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); +} + void mptcp_destroy_common(struct mptcp_sock *msk); void __init mptcp_token_init(void); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ac4a1fe3550b..42581ffb0c7e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1125,6 +1125,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (err && err != -EINPROGRESS) goto failed; + sock_hold(ssk); spin_lock_bh(&msk->join_list_lock); list_add_tail(&subflow->node, &msk->join_list); spin_unlock_bh(&msk->join_list_lock); @@ -1132,6 +1133,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, return err; failed: + subflow->disposable = 1; sock_release(sf); return err; } @@ -1254,7 +1256,6 @@ static void subflow_state_change(struct sock *sk) mptcp_data_ready(parent, sk); if (__mptcp_check_fallback(mptcp_sk(parent)) && - !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); @@ -1297,17 +1298,26 @@ out: return err; } -static void subflow_ulp_release(struct sock *sk) +static void subflow_ulp_release(struct sock *ssk) { - struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + bool release = true; + struct sock *sk; if (!ctx) return; - if (ctx->conn) - sock_put(ctx->conn); + sk = ctx->conn; + if (sk) { + /* if the msk has been orphaned, keep the ctx + * alive, will be freed by mptcp_done() + */ + release = ctx->disposable; + sock_put(sk); + } - kfree_rcu(ctx, rcu); + if (release) + kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, -- cgit v1.2.3 From d9ca1de8c0cd7a8ca2a0506e1741418741848e53 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:10 +0100 Subject: mptcp: move page frag allocation in mptcp_sendmsg() mptcp_sendmsg() is refactored so that first it copies the data provided from user space into the send queue, and then tries to spool the send queue via sendmsg_frag. There a subtle change in the mptcp level collapsing on consecutive data fragment: we now allow that only on unsent data. The latter don't need to deal with msghdr data anymore and can be simplified in a relevant way. snd_nxt and write_seq are now tracked independently. Overall this allows some relevant cleanup and will allow sending pending mptcp data on msk una update in later patch. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 406 ++++++++++++++++++++++++--------------------------- 1 file changed, 189 insertions(+), 217 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index daa51e657db8..9b30c4b39159 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -43,6 +43,7 @@ struct mptcp_skb_cb { static struct percpu_counter mptcp_sockets_allocated; static void __mptcp_destroy_sock(struct sock *sk); +static void __mptcp_check_send_data_fin(struct sock *sk); /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. @@ -814,6 +815,7 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && + pfrag->size - pfrag->offset > 0 && df->data_seq + df->data_len == msk->write_seq; } @@ -864,6 +866,8 @@ static void mptcp_clean_una(struct sock *sk) if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; + if (WARN_ON_ONCE(dfrag == msk->first_pending)) + break; dfrag_clear(sk, dfrag); cleaned = true; } @@ -872,12 +876,13 @@ static void mptcp_clean_una(struct sock *sk) if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; - if (WARN_ON_ONCE(delta > dfrag->data_len)) + if (WARN_ON_ONCE(delta > dfrag->already_sent)) goto out; dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; + dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); cleaned = true; @@ -911,12 +916,23 @@ static void mptcp_clean_una_wakeup(struct sock *sk) */ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + bool first = true; + if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), pfrag, sk->sk_allocation))) return true; - sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (first) + tcp_enter_memory_pressure(ssk); + sk_stream_moderate_sndbuf(ssk); + first = false; + } return false; } @@ -932,6 +948,7 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); + dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; @@ -940,121 +957,58 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, struct mptcp_sendmsg_info { int mss_now; int size_goal; + u16 limit; + u16 sent; + unsigned int flags; }; static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, struct mptcp_data_frag *dfrag, + struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) { - int avail_size, offset, ret, frag_truesize = 0; - bool dfrag_collapsed, can_collapse = false; + u64 data_seq = dfrag->data_seq + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_ext *mpext = NULL; - bool retransmission = !!dfrag; struct sk_buff *skb, *tail; - struct page_frag *pfrag; - struct page *page; - u64 *write_seq; - size_t psize; - - /* use the mptcp page cache so that we can easily move the data - * from one substream to another, but do per subflow memory accounting - * Note: pfrag is used only !retransmission, but the compiler if - * fooled into a warning if we don't init here - */ - pfrag = sk_page_frag(sk); - if (!retransmission) { - write_seq = &msk->write_seq; - page = pfrag->page; - } else { - write_seq = &dfrag->data_seq; - page = dfrag->page; - } + bool can_collapse = false; + int avail_size; + size_t ret; - /* compute copy limit */ - info->mss_now = tcp_send_mss(ssk, &info->size_goal, msg->msg_flags); + pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", + msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); + + /* compute send limit */ + info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); avail_size = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { - mpext = skb_ext_find(skb, SKB_EXT_MPTCP); - /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); can_collapse = (info->size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(*write_seq, skb, mpext); + mptcp_skb_can_collapse_to(data_seq, skb, mpext); if (!can_collapse) TCP_SKB_CB(skb)->eor = 1; else avail_size = info->size_goal - skb->len; } - if (!retransmission) { - /* reuse tail pfrag, if possible, or carve a new one from the - * page allocator - */ - dfrag = mptcp_rtx_tail(sk); - offset = pfrag->offset; - dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); - if (!dfrag_collapsed) { - dfrag = mptcp_carve_data_frag(msk, pfrag, offset); - offset = dfrag->offset; - frag_truesize = dfrag->overhead; - } - psize = min_t(size_t, pfrag->size - offset, avail_size); - - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, offset, - min_t(size_t, msg_data_left(msg), - psize), - &msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; - if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) { - iov_iter_revert(&msg->msg_iter, psize); - return -ENOMEM; - } - } else { - offset = dfrag->offset; - psize = min_t(size_t, dfrag->data_len, avail_size); - } - - tail = tcp_build_frag(ssk, psize, msg->msg_flags, page, offset, &psize); + ret = info->limit - info->sent; + tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, + dfrag->offset + info->sent, &ret); if (!tail) { tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); return -ENOMEM; } - ret = psize; - frag_truesize += ret; - if (!retransmission) { - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); - - /* send successful, keep track of sent data for mptcp-level - * retransmission - */ - dfrag->data_len += ret; - if (!dfrag_collapsed) { - get_page(dfrag->page); - list_add_tail(&dfrag->list, &msk->rtx_queue); - sk_wmem_queued_add(sk, frag_truesize); - } else { - sk_wmem_queued_add(sk, ret); - } - - /* charge data on mptcp rtx queue to the master socket - * Note: we charge such data both to sk and ssk - */ - sk->sk_forward_alloc -= frag_truesize; - } - /* if the tail skb is still the cached one, collapsing really happened. */ if (skb == tail) { @@ -1067,7 +1021,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, msk->cached_ext = NULL; memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = *write_seq; + mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -1078,11 +1032,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->dsn64); out: - if (!retransmission) - pfrag->offset += frag_truesize; - WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; } @@ -1210,19 +1160,86 @@ static void ssk_check_wmem(struct mptcp_sock *msk) mptcp_nospace(msk); } -static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +static void mptcp_push_release(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) +{ + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); + release_sock(ssk); +} + +static void mptcp_push_pending(struct sock *sk, unsigned int flags) { + struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { - .mss_now = 0, - .size_goal = 0, + .flags = flags, }; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + u32 sndbuf; + + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + prev_ssk = ssk; + __mptcp_flush_join_list(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); + + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, sndbuf); + + /* try to keep the subflow socket lock across + * consecutive xmit on the same socket + */ + if (ssk != prev_ssk && prev_ssk) + mptcp_push_release(sk, prev_ssk, &info); + if (!ssk) + goto out; + + if (ssk != prev_ssk || !prev_ssk) + lock_sock(ssk); + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + + /* at this point we held the socket lock for the last subflow we used */ + if (ssk) + mptcp_push_release(sk, ssk, &info); + +out: + /* start the timer, if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + if (copied) + __mptcp_check_send_data_fin(sk); +} + +static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; - struct sock *ssk; int ret = 0; - u32 sndbuf; - bool tx_ok; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) @@ -1239,129 +1256,93 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } pfrag = sk_page_frag(sk); -restart: mptcp_clean_una(sk); - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - - __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk, &sndbuf); - while (!sk_stream_memory_free(sk) || - !ssk || - !mptcp_page_frag_refill(ssk, pfrag)) { - if (ssk) { - /* make sure retransmit timer is - * running before we wait for memory. - * - * The retransmit timer might be needed - * to make the peer send an up-to-date - * MPTCP Ack. - */ - mptcp_set_timeout(sk, ssk); - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); - } - - mptcp_nospace(msk); - ret = sk_stream_wait_memory(sk, &timeo); - if (ret) - goto out; - - mptcp_clean_una(sk); + while (msg_data_left(msg)) { + struct mptcp_data_frag *dfrag; + int frag_truesize = 0; + bool dfrag_collapsed; + size_t psize, offset; - ssk = mptcp_subflow_get_send(msk, &sndbuf); - if (list_empty(&msk->conn_list)) { - ret = -ENOTCONN; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + ret = -EPIPE; goto out; } - } - /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, sndbuf); - - pr_debug("conn_list->subflow=%p", ssk); - - lock_sock(ssk); - tx_ok = msg_data_left(msg); - while (tx_ok) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &info); - if (ret < 0) { - if (ret == -EAGAIN && timeo > 0) { - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; + /* reuse tail pfrag, if possible, or carve a new one from the + * page allocator + */ + dfrag = mptcp_pending_tail(sk); + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + if (!dfrag_collapsed) { + if (!sk_stream_memory_free(sk)) { + mptcp_push_pending(sk, msg->msg_flags); + if (!sk_stream_memory_free(sk)) + goto wait_for_memory; } - break; + if (!mptcp_page_frag_refill(sk, pfrag)) + goto wait_for_memory; + + dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); + frag_truesize = dfrag->overhead; } - /* burst can be negative, we will try move to the next subflow - * at selection time, if possible. + /* we do not bound vs wspace, to allow a single packet. + * memory accounting will prevent execessive memory usage + * anyway */ - msk->snd_burst -= ret; - copied += ret; - - tx_ok = msg_data_left(msg); - if (!tx_ok) - break; - - if (!sk_stream_memory_free(ssk) || - !mptcp_page_frag_refill(ssk, pfrag) || - !mptcp_ext_cache_refill(msk)) { - tcp_push(ssk, msg->msg_flags, info.mss_now, - tcp_sk(ssk)->nonagle, info.size_goal); - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; + offset = dfrag->offset + dfrag->data_len; + psize = pfrag->size - offset; + psize = min_t(size_t, psize, msg_data_left(msg)); + if (!sk_wmem_schedule(sk, psize + frag_truesize)) + goto wait_for_memory; + + if (copy_page_from_iter(dfrag->page, offset, psize, + &msg->msg_iter) != psize) { + ret = -EFAULT; + goto out; } - /* memory is charged to mptcp level socket as well, i.e. - * if msg is very large, mptcp socket may run out of buffer - * space. mptcp_clean_una() will release data that has - * been acked at mptcp level in the mean time, so there is - * a good chance we can continue sending data right away. - * - * Normally, when the tcp subflow can accept more data, then - * so can the MPTCP socket. However, we need to cope with - * peers that might lag behind in their MPTCP-level - * acknowledgements, i.e. data might have been acked at - * tcp level only. So, we must also check the MPTCP socket - * limits before we send more data. + /* data successfully copied into the write queue */ + copied += psize; + dfrag->data_len += psize; + frag_truesize += psize; + pfrag->offset += frag_truesize; + WRITE_ONCE(msk->write_seq, msk->write_seq + psize); + + /* charge data on mptcp pending queue to the msk socket + * Note: we charge such data both to sk and ssk */ - if (unlikely(!sk_stream_memory_free(sk))) { - tcp_push(ssk, msg->msg_flags, info.mss_now, - tcp_sk(ssk)->nonagle, info.size_goal); - mptcp_clean_una(sk); - if (!sk_stream_memory_free(sk)) { - /* can't send more for now, need to wait for - * MPTCP-level ACKs from peer. - * - * Wakeup will happen via mptcp_clean_una(). - */ - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; - } + sk_wmem_queued_add(sk, frag_truesize); + sk->sk_forward_alloc -= frag_truesize; + if (!dfrag_collapsed) { + get_page(dfrag->page); + list_add_tail(&dfrag->list, &msk->rtx_queue); + if (!msk->first_pending) + WRITE_ONCE(msk->first_pending, dfrag); } - } + pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk, + dfrag->data_seq, dfrag->data_len, dfrag->already_sent, + !dfrag_collapsed); - mptcp_set_timeout(sk, ssk); - if (copied) { - tcp_push(ssk, msg->msg_flags, info.mss_now, - tcp_sk(ssk)->nonagle, info.size_goal); + if (!mptcp_ext_cache_refill(msk)) + goto wait_for_memory; + continue; - /* start the timer, if it's not pending */ - if (!mptcp_timer_pending(sk)) +wait_for_memory: + mptcp_nospace(msk); + mptcp_clean_una(sk); + if (mptcp_timer_pending(sk)) mptcp_reset_timer(sk); + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) + goto out; } - release_sock(ssk); + if (copied) + mptcp_push_pending(sk, msg->msg_flags); + out: - msk->snd_nxt = msk->write_seq; ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; @@ -1700,7 +1681,7 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); if (__mptcp_check_fallback(msk)) - return msk->first; + return NULL; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -1843,12 +1824,7 @@ static void mptcp_worker(struct work_struct *work) struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - int orig_len, orig_offset; - u64 orig_write_seq; size_t copied = 0; - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT, - }; int state, ret; lock_sock(sk); @@ -1901,18 +1877,17 @@ static void mptcp_worker(struct work_struct *work) lock_sock(ssk); - orig_len = dfrag->data_len; - orig_offset = dfrag->offset; - orig_write_seq = dfrag->data_seq; - while (dfrag->data_len > 0) { - ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &info); + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = dfrag->already_sent; + while (info.sent < dfrag->already_sent) { + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret < 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; - dfrag->data_len -= ret; - dfrag->offset += ret; + info.sent += ret; if (!mptcp_ext_cache_refill(msk)) break; @@ -1921,10 +1896,6 @@ static void mptcp_worker(struct work_struct *work) tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); - dfrag->data_seq = orig_write_seq; - dfrag->offset = orig_offset; - dfrag->data_len = orig_len; - mptcp_set_timeout(sk, ssk); release_sock(ssk); @@ -1996,6 +1967,7 @@ static void __mptcp_clear_xmit(struct sock *sk) sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } -- cgit v1.2.3 From 813e0a683d4cacb668622bc9a1693cb82b5f8ff8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:11 +0100 Subject: mptcp: try to push pending data on snd una updates After the previous patch we may end-up with unsent data in the write buffer. If such buffer is full, the writer will block for unlimited time. We need to trigger the MPTCP xmit path even for the subflow rx path, on MPTCP snd_una updates. Keep things simple and just schedule the work queue if needed. Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9b30c4b39159..821daa922ca3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -725,6 +725,7 @@ void mptcp_data_acked(struct sock *sk) mptcp_reset_timer(sk); if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || + mptcp_send_head(sk) || (inet_sk_state_load(sk) != TCP_ESTABLISHED))) mptcp_schedule_work(sk); } @@ -1840,6 +1841,8 @@ static void mptcp_worker(struct work_struct *work) __mptcp_close_subflow(msk); __mptcp_move_skbs(msk); + if (mptcp_send_head(sk)) + mptcp_push_pending(sk, 0); if (msk->pm.status) pm_work(msk); -- cgit v1.2.3 From 8edf08649eede6e5a3e39a3d38c63f30039a0c1e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Nov 2020 10:48:12 +0100 Subject: mptcp: rework poll+nospace handling MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at least one subflow and the mptcp socket itself are writeable. mptcp_poll returns EPOLLOUT if the bit is set. mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when last write has used up all subflows or the mptcp socket wmem. This reworks nospace handling as follows: MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning. This bit is set when the mptcp socket is not writeable. The mptcp-level ack path schedule will then schedule the mptcp worker to allow it to free already-acked data (and reduce wmem usage). This will then wake userspace processes that wait for a POLLOUT event. sendmsg will set MPTCP_NOSPACE only when it has to wait for more wmem (blocking I/O case). poll path will set MPTCP_NOSPACE in case the mptcp socket is not writeable. Normal tcp-level notification (SOCK_NOSPACE) is only enabled in case the subflow socket has no available wmem. Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 92 +++++++++++++++++++++++++++------------------------- net/mptcp/protocol.h | 2 +- net/mptcp/subflow.c | 11 +++---- 3 files changed, 54 insertions(+), 51 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 821daa922ca3..7fcd26011a3d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || + if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) || mptcp_send_head(sk) || (inet_sk_state_load(sk) != TCP_ESTABLISHED))) mptcp_schedule_work(sk); @@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } -static bool mptcp_is_writeable(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - if (!sk_stream_is_writeable((struct sock *)msk)) - return false; - - mptcp_for_each_subflow(msk, subflow) { - if (sk_stream_is_writeable(subflow->tcp_sock)) - return true; - } - return false; -} - static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk) mptcp_clean_una(sk); /* Only wake up writers if a subflow is ready */ - if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ + if (sk_stream_is_writeable(sk)) { + clear_bit(MPTCP_NOSPACE, &msk->flags); sk_stream_write_space(sk); } } @@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - clear_bit(MPTCP_SEND_SPACE, &msk->flags); + set_bit(MPTCP_NOSPACE, &msk->flags); smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool ssk_writeable = sk_stream_is_writeable(ssk); struct socket *sock = READ_ONCE(ssk->sk_socket); + if (ssk_writeable || !sock) + continue; + /* enables ssk->write_space() callbacks */ - if (sock) - set_bit(SOCK_NOSPACE, &sock->flags); + set_bit(SOCK_NOSPACE, &sock->flags); } + + /* mptcp_data_acked() could run just before we set the NOSPACE bit, + * so explicitly check for snd_una value + */ + mptcp_clean_una((struct sock *)msk); } static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) @@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, return NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk) -{ - if (unlikely(!mptcp_is_writeable(msk))) - mptcp_nospace(msk); -} - static void mptcp_push_release(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { @@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) wait_for_memory: mptcp_nospace(msk); - mptcp_clean_una(sk); if (mptcp_timer_pending(sk)) mptcp_reset_timer(sk); ret = sk_stream_wait_memory(sk, &timeo); @@ -1344,7 +1326,6 @@ wait_for_memory: mptcp_push_pending(sk, msg->msg_flags); out: - ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; } @@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); - __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; @@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk) return true; } -static bool mptcp_memory_free(const struct sock *sk, int wake) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; -} - static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = { .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, - .stream_memory_free = mptcp_memory_free, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, @@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 0; } +static bool __mptcp_check_writeable(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + bool mptcp_writable; + + mptcp_clean_una(sk); + mptcp_writable = sk_stream_is_writeable(sk); + if (!mptcp_writable) + mptcp_nospace(msk); + + return mptcp_writable; +} + +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + __poll_t ret = 0; + bool slow; + + if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) + return 0; + + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; + + slow = lock_sock_fast(sk); + if (__mptcp_check_writeable(msk)) + ret = EPOLLOUT | EPOLLWRNORM; + + unlock_sock_fast(sk, slow); + return ret; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + mask |= mptcp_check_writeable(msk); } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index fd9c666aed7f..8345011fc0ba 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -86,7 +86,7 @@ /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 -#define MPTCP_SEND_SPACE 1 +#define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 42581ffb0c7e..794259789194 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk) static void subflow_write_space(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct socket *sock = READ_ONCE(sk->sk_socket); struct sock *parent = subflow->conn; if (!sk_stream_is_writeable(sk)) return; - if (sk_stream_is_writeable(parent)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); - smp_mb__after_atomic(); - /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ - sk_stream_write_space(parent); - } + if (sock && sk_stream_is_writeable(parent)) + clear_bit(SOCK_NOSPACE, &sock->flags); + + sk_stream_write_space(parent); } static struct inet_connection_sock_af_ops * -- cgit v1.2.3 From 6f8a612a33e426d473f7161d1950dc00a613494b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Nov 2020 10:48:13 +0100 Subject: mptcp: keep track of advertised windows right edge Before sending 'x' new bytes also check that the new snd_una would be within the permitted receive window. For every ACK that also contains a DSS ack, check whether its tcp-level receive window would advance the current mptcp window right edge and update it if so. Signed-off-by: Florian Westphal Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 24 ++++++++++++++++++++---- net/mptcp/protocol.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- net/mptcp/protocol.h | 1 + 3 files changed, 69 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1be272d2bd95..f2d1e27a2bc1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -809,11 +809,14 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) return cur_ack; } -static void update_una(struct mptcp_sock *msk, - struct mptcp_options_received *mp_opt) +static void ack_update_msk(struct mptcp_sock *msk, + const struct sock *ssk, + struct mptcp_options_received *mp_opt) { u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); + u64 new_wnd_end, wnd_end, old_wnd_end = atomic64_read(&msk->wnd_end); u64 snd_nxt = READ_ONCE(msk->snd_nxt); + struct sock *sk = (struct sock *)msk; /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way @@ -825,12 +828,25 @@ static void update_una(struct mptcp_sock *msk, if (after64(new_snd_una, snd_nxt)) new_snd_una = old_snd_una; + new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; + + while (after64(new_wnd_end, old_wnd_end)) { + wnd_end = old_wnd_end; + old_wnd_end = atomic64_cmpxchg(&msk->wnd_end, wnd_end, + new_wnd_end); + if (old_wnd_end == wnd_end) { + if (mptcp_send_head(sk)) + mptcp_schedule_work(sk); + break; + } + } + while (after64(new_snd_una, old_snd_una)) { snd_una = old_snd_una; old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, new_snd_una); if (old_snd_una == snd_una) { - mptcp_data_acked((struct sock *)msk); + mptcp_data_acked(sk); break; } } @@ -930,7 +946,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) * monodirectional flows will stuck */ if (mp_opt.use_ack) - update_una(msk, &mp_opt); + ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7fcd26011a3d..5a92b9239909 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -57,6 +57,12 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } +/* Returns end sequence number of the receiver's advertised window */ +static u64 mptcp_wnd_end(const struct mptcp_sock *msk) +{ + return atomic64_read(&msk->wnd_end); +} + static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; @@ -174,6 +180,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) if (after64(seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); + pr_debug("oow by %ld", (unsigned long)seq - (unsigned long)max_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -847,6 +854,7 @@ static void mptcp_clean_una(struct sock *sk) */ if (__mptcp_check_fallback(msk)) atomic64_set(&msk->snd_una, msk->snd_nxt); + snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { @@ -944,12 +952,30 @@ struct mptcp_sendmsg_info { unsigned int flags; }; +static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, + int avail_size) +{ + u64 window_end = mptcp_wnd_end(msk); + + if (__mptcp_check_fallback(msk)) + return avail_size; + + if (!before64(data_seq + avail_size, window_end)) { + u64 allowed_size = window_end - data_seq; + + return min_t(unsigned int, allowed_size, avail_size); + } + + return avail_size; +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); + bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; struct sk_buff *skb, *tail; bool can_collapse = false; @@ -979,6 +1005,16 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, avail_size = info->size_goal - skb->len; } + /* Zero window and all data acked? Probe. */ + avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); + if (avail_size == 0) { + if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt) + return 0; + zero_window_probe = true; + data_seq = atomic64_read(&msk->snd_una) - 1; + avail_size = 1; + } + if (WARN_ON_ONCE(info->sent > info->limit || info->limit > dfrag->data_len)) return 0; @@ -996,6 +1032,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (skb == tail) { WARN_ON_ONCE(!can_collapse); mpext->data_len += ret; + WARN_ON_ONCE(zero_window_probe); goto out; } @@ -1013,6 +1050,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); + if (zero_window_probe) { + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mpext->frozen = 1; + ret = 0; + tcp_push_pending_frames(ssk); + } out: mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; @@ -1866,7 +1909,7 @@ static void mptcp_worker(struct work_struct *work) info.limit = dfrag->already_sent; while (info.sent < dfrag->already_sent) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); - if (ret < 0) + if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); @@ -2226,6 +2269,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; atomic64_set(&msk->snd_una, msk->write_seq); + atomic64_set(&msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd); + if (mp_opt->mp_capable) { msk->can_ack = true; msk->remote_key = mp_opt->sndr_key; @@ -2258,6 +2303,8 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; + + atomic64_set(&msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 8345011fc0ba..b4c8dbe9236b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -215,6 +215,7 @@ struct mptcp_sock { struct sock *last_snd; int snd_burst; atomic64_t snd_una; + atomic64_t wnd_end; unsigned long timer_ival; u32 token; unsigned long flags; -- cgit v1.2.3 From 7ed90803a213736290bdcf971764ddb8ff3fa44f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 16 Nov 2020 10:48:14 +0100 Subject: mptcp: send explicit ack on delayed ack_seq incr When the worker moves some bytes from the OoO queue into the receive queue, the msk->ask_seq is updated, the MPTCP-level ack carrying that value needs to wait the next ingress packet, possibly slowing down or hanging the peer Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5a92b9239909..8df013daea88 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -407,13 +407,27 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static void mptcp_check_data_fin(struct sock *sk) +static void mptcp_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + } +} + +static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; + bool ret = false; if (__mptcp_check_fallback(msk) || !msk->first) - return; + return ret; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. @@ -429,8 +443,6 @@ static void mptcp_check_data_fin(struct sock *sk) */ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { - struct mptcp_subflow_context *subflow; - WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); @@ -454,17 +466,12 @@ static void mptcp_check_data_fin(struct sock *sk) break; } + ret = true; mptcp_set_timeout(sk, NULL); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); - } - + mptcp_send_ack(msk); mptcp_close_wake_up(sk); } + return ret; } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, @@ -1547,7 +1554,8 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) } while (!done); if (mptcp_ofo_queue(msk) || moved > 0) { - mptcp_check_data_fin((struct sock *)msk); + if (!mptcp_check_data_fin((struct sock *)msk)) + mptcp_send_ack(msk); return true; } return false; -- cgit v1.2.3 From b680a214ec281dbd44b5ebbf3f126a57f1ecf0f7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 18 Nov 2020 23:05:34 +0100 Subject: mptcp: update rtx timeout only if required. We must start the retransmission timer only there are pending data in the rtx queue. Otherwise we can hit a WARN_ON in mptcp_reset_timer(), as syzbot demonstrated. Reported-and-tested-by: syzbot+42aa53dafb66a07e5a24@syzkaller.appspotmail.com Fixes: d9ca1de8c0cd ("mptcp: move page frag allocation in mptcp_sendmsg()") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Reported-by: Naresh Kamboju Tested-by: Naresh Kamboju Link: https://lore.kernel.org/r/1a72039f112cae048c44d398ffa14e0a1432db3d.1605737083.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8df013daea88..aeda4357de9a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1261,11 +1261,12 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags) mptcp_push_release(sk, ssk, &info); out: - /* start the timer, if it's not pending */ - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); - if (copied) + if (copied) { + /* start the timer, if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); __mptcp_check_send_data_fin(sk); + } } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -- cgit v1.2.3 From b2771d2419fa6e978dec9ba6ccb93c5c76106374 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Nov 2020 11:45:54 -0800 Subject: mptcp: drop WORKER_RUNNING status bit Only mptcp_close() can actually cancel the workqueue, no need to add and use this flag. Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 8 +------- net/mptcp/protocol.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index aeda4357de9a..1aaf58c59f41 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1862,7 +1862,6 @@ static void mptcp_worker(struct work_struct *work) int state, ret; lock_sock(sk); - set_bit(MPTCP_WORKER_RUNNING, &msk->flags); state = sk->sk_state; if (unlikely(state == TCP_CLOSE)) goto unlock; @@ -1940,7 +1939,6 @@ reset_unlock: mptcp_reset_timer(sk); unlock: - clear_bit(MPTCP_WORKER_RUNNING, &msk->flags); release_sock(sk); sock_put(sk); } @@ -2011,11 +2009,7 @@ static void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - /* if called by the work itself, do not try to cancel the work, or - * we will hang. - */ - if (!test_bit(MPTCP_WORKER_RUNNING, &msk->flags) && - cancel_work_sync(&msk->work)) + if (cancel_work_sync(&msk->work)) __sock_put(sk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b4c8dbe9236b..10fffc5de9e4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -91,7 +91,6 @@ #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 -#define MPTCP_WORKER_RUNNING 6 static inline bool before64(__u64 seq1, __u64 seq2) { -- cgit v1.2.3 From 26aa231439fef49f11284ea9d9245e074d69197a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Nov 2020 11:45:55 -0800 Subject: mptcp: fix state tracking for fallback socket We need to cope with some more state transition for fallback sockets, or could still end-up moving to TCP_CLOSE too early and avoid spooling some pending data Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1aaf58c59f41..2d450865937d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -777,7 +777,9 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) inet_sk_state_store(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1: - /* fallback sockets skip TCP_CLOSING - TCP will take care */ + inet_sk_state_store(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: inet_sk_state_store(sk, TCP_CLOSE); break; default: @@ -2086,10 +2088,16 @@ static void __mptcp_check_send_data_fin(struct sock *sk) WRITE_ONCE(msk->snd_nxt, msk->write_seq); - /* fallback socket will not get data_fin/ack, can move to close now */ - if (__mptcp_check_fallback(msk) && sk->sk_state == TCP_LAST_ACK) { - inet_sk_state_store(sk, TCP_CLOSE); - mptcp_close_wake_up(sk); + /* fallback socket will not get data_fin/ack, can move to the next + * state now + */ + if (__mptcp_check_fallback(msk)) { + if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_close_wake_up(sk); + } else if (sk->sk_state == TCP_FIN_WAIT1) { + inet_sk_state_store(sk, TCP_FIN_WAIT2); + } } __mptcp_flush_join_list(msk); -- cgit v1.2.3 From 860975c6f80adae9d2c7654bde04a99dd28bc94f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 19 Nov 2020 11:45:56 -0800 Subject: mptcp: skip to next candidate if subflow has unacked data In case a subflow path is blocked, MPTCP-level retransmit may not take place anymore because such subflow is likely to have unacked data left in its write queue. Ignore subflows that have experienced loss and test next candidate. Fixes: 3b1d6210a95773691 ("mptcp: implement and use MPTCP-level retransmission") Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2d450865937d..868c76cf0f15 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1726,8 +1726,11 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) continue; /* still data outstanding at TCP level? Don't retransmit. */ - if (!tcp_write_queue_empty(ssk)) + if (!tcp_write_queue_empty(ssk)) { + if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss) + continue; return NULL; + } if (subflow->backup) { if (!backup) -- cgit v1.2.3 From 0397c6d85f9c6f81f6dc3a0a166331b2475b325c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Nov 2020 11:45:58 -0800 Subject: mptcp: keep unaccepted MPC subflow into join list This will simplify all operation dealing with subflows before accept time (e.g. data fin processing, add_addr). The join list is already flushed by mptcp_stream_accept() before returning the newly created msk to the user space. This also fixes an potential bug present into the old code: conn_list was manipulated without helding the msk lock in mptcp_stream_accept(). Tested-by: Geliang Tang Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 24 ++++++++---------------- net/mptcp/protocol.h | 9 +++++++++ net/mptcp/subflow.c | 10 +++++----- 3 files changed, 22 insertions(+), 21 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 868c76cf0f15..212be3bef66e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2343,7 +2343,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; - struct sock *ssk = newsk; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; @@ -2358,22 +2357,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, /* acquire the 2nd reference for the owning socket */ sock_hold(new_mptcp_sock); - - local_bh_disable(); - bh_lock_sock(new_mptcp_sock); - msk = mptcp_sk(new_mptcp_sock); - msk->first = newsk; - newsk = new_mptcp_sock; - mptcp_copy_inaddrs(newsk, ssk); - list_add(&subflow->node, &msk->conn_list); - sock_hold(ssk); - - mptcp_rcv_space_init(msk, ssk); - bh_unlock_sock(new_mptcp_sock); - - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - local_bh_enable(); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); @@ -2824,6 +2809,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; + struct sock *newsk = newsock->sk; + bool slowpath; + + slowpath = lock_sock_fast(newsk); + mptcp_copy_inaddrs(newsk, msk->first); + mptcp_rcv_space_init(msk, msk->first); /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. @@ -2835,6 +2826,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + unlock_sock_fast(newsk, slowpath); } if (inet_csk_listen_poll(ssock->sk)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 10fffc5de9e4..7affaf0b1941 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -403,6 +403,15 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } +static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + sock_hold(mptcp_subflow_tcp_sock(subflow)); + spin_lock_bh(&msk->join_list_lock); + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); +} + int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 794259789194..d3c6b3a5ad55 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -578,6 +578,10 @@ create_child: */ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* link the newly created socket to the msk */ + mptcp_add_pending_subflow(mptcp_sk(new_msk), ctx); + WRITE_ONCE(mptcp_sk(new_msk)->first, child); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ @@ -1124,11 +1128,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (err && err != -EINPROGRESS) goto failed; - sock_hold(ssk); - spin_lock_bh(&msk->join_list_lock); - list_add_tail(&subflow->node, &msk->join_list); - spin_unlock_bh(&msk->join_list_lock); - + mptcp_add_pending_subflow(msk, subflow); return err; failed: -- cgit v1.2.3 From d91d322a72a390702376787b925711ce8338daec Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 Nov 2020 11:45:59 -0800 Subject: mptcp: change add_addr_signal type This patch changed the 'add_addr_signal' type from bool to char, so that we could encode the addr type there. Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 15 +++++++++------ net/mptcp/protocol.h | 15 ++++++++++++--- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index f9c88e2abb8e..c2c12f02a263 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -16,11 +16,15 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo) { + u8 add_addr = READ_ONCE(msk->pm.add_addr_signal); + pr_debug("msk=%p, local_id=%d", msk, addr->id); msk->pm.local = *addr; - WRITE_ONCE(msk->pm.add_addr_echo, echo); - WRITE_ONCE(msk->pm.add_addr_signal, true); + add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); + if (echo) + add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); + WRITE_ONCE(msk->pm.add_addr_signal, add_addr); return 0; } @@ -182,13 +186,13 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; - *echo = READ_ONCE(msk->pm.add_addr_echo); + *echo = mptcp_pm_should_add_signal_echo(msk); if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo)) goto out_unlock; *saddr = msk->pm.local; - WRITE_ONCE(msk->pm.add_addr_signal, false); + WRITE_ONCE(msk->pm.add_addr_signal, 0); ret = true; out_unlock: @@ -232,11 +236,10 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.subflows = 0; msk->pm.rm_id = 0; WRITE_ONCE(msk->pm.work_pending, false); - WRITE_ONCE(msk->pm.add_addr_signal, false); + WRITE_ONCE(msk->pm.add_addr_signal, 0); WRITE_ONCE(msk->pm.rm_addr_signal, false); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); - WRITE_ONCE(msk->pm.add_addr_echo, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7affaf0b1941..a62ffae621c2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -165,6 +165,11 @@ enum mptcp_pm_status { MPTCP_PM_SUBFLOW_ESTABLISHED, }; +enum mptcp_add_addr_status { + MPTCP_ADD_ADDR_SIGNAL, + MPTCP_ADD_ADDR_ECHO, +}; + struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; @@ -172,13 +177,12 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ - bool add_addr_signal; + u8 add_addr_signal; bool rm_addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; - bool add_addr_echo; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; @@ -516,7 +520,12 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal); + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); +} + +static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) -- cgit v1.2.3 From 84dfe3677a6f45b3d0dfdd564e55717a1a5e60cc Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 Nov 2020 11:46:00 -0800 Subject: mptcp: send out dedicated ADD_ADDR packet When ADD_ADDR suboption includes an IPv6 address, the size is 28 octets. It will not fit when other MPTCP suboptions are included in this packet, e.g. DSS. So here we send out an ADD_ADDR dedicated packet to carry only ADD_ADDR suboption, no other MPTCP suboptions. In mptcp_pm_announce_addr, we check whether this is an IPv6 ADD_ADDR. If it is, we set the flag MPTCP_ADD_ADDR_IPV6 to true. Then we call mptcp_pm_nl_add_addr_send_ack to sent out a new pure ACK packet. In mptcp_established_options_add_addr, we check whether this is a pure ACK packet for ADD_ADDR. If it is, we drop all other MPTCP suboptions in this packet, only put ADD_ADDR suboption in it. Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 25 ++++++++++++++++++++++--- net/mptcp/pm.c | 16 ++++++++++++++-- net/mptcp/pm_netlink.c | 29 +++++++++++++++++++++++++++++ net/mptcp/protocol.c | 6 +++++- net/mptcp/protocol.h | 10 ++++++++++ 5 files changed, 80 insertions(+), 6 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index f2d1e27a2bc1..d7afffcc87c1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -242,7 +242,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->add_addr = 1; mp_opt->addr_id = *ptr++; - pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo); + pr_debug("ADD_ADDR%s: id=%d, echo=%d", + (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", + mp_opt->addr_id, mp_opt->echo); if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); ptr += 4; @@ -573,17 +575,27 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, } #endif -static bool mptcp_established_options_add_addr(struct sock *sk, +static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + bool drop_other_suboptions = false; + unsigned int opt_size = *size; struct mptcp_addr_info saddr; bool echo; int len; + if (mptcp_pm_should_add_signal_ipv6(msk) && + skb && skb_is_tcp_pure_ack(skb)) { + pr_debug("drop other suboptions"); + opts->suboptions = 0; + remaining += opt_size; + drop_other_suboptions = true; + } + if (!mptcp_pm_should_add_signal(msk) || !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo))) return false; @@ -593,6 +605,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, return false; *size = len; + if (drop_other_suboptions) + *size -= opt_size; opts->addr_id = saddr.id; if (saddr.family == AF_INET) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR; @@ -678,7 +692,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; - if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) { + if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; @@ -759,6 +773,11 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto fully_established; } + if (mp_opt->add_addr) { + WRITE_ONCE(msk->fully_established, true); + return true; + } + /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index c2c12f02a263..75c5040e8d5d 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -24,6 +24,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); if (echo) add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); + if (addr->family == AF_INET6) + add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); WRITE_ONCE(msk->pm.add_addr_signal, add_addr); return 0; } @@ -153,14 +155,24 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_lock_bh(&pm->lock); - if (!READ_ONCE(pm->accept_addr)) + if (!READ_ONCE(pm->accept_addr)) { mptcp_pm_announce_addr(msk, addr, true); - else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) + mptcp_pm_add_addr_send_ack(msk); + } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } spin_unlock_bh(&pm->lock); } +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) +{ + if (!mptcp_pm_should_add_signal_ipv6(msk)) + return; + + mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); +} + void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) { struct mptcp_pm_data *pm = &msk->pm; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f8a9d82a0ea8..03f2c28f11f5 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -228,6 +228,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!mptcp_pm_should_add_signal(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } @@ -328,6 +329,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (mptcp_pm_alloc_anno_list(msk, local)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_nl_add_addr_send_ack(msk); } } else { /* pick failed, avoid fourther attempts later */ @@ -398,6 +400,33 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) spin_lock_bh(&msk->pm.lock); mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_nl_add_addr_send_ack(msk); +} + +void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!mptcp_pm_should_add_signal_ipv6(msk)) + return; + + __mptcp_flush_join_list(msk); + subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); + if (subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 add_addr; + + spin_unlock_bh(&msk->pm.lock); + pr_debug("send ack for add_addr6"); + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + spin_lock_bh(&msk->pm.lock); + + add_addr = READ_ONCE(msk->pm.add_addr_signal); + add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); + WRITE_ONCE(msk->pm.add_addr_signal, add_addr); + } } void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 212be3bef66e..8f303de7f500 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -690,7 +690,7 @@ wake: sk->sk_data_ready(sk); } -static void __mptcp_flush_join_list(struct mptcp_sock *msk) +void __mptcp_flush_join_list(struct mptcp_sock *msk) { if (likely(list_empty(&msk->join_list))) return; @@ -1808,6 +1808,10 @@ static void pm_work(struct mptcp_sock *msk) pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); + mptcp_pm_nl_add_addr_send_ack(msk); + } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); mptcp_pm_nl_rm_addr_received(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a62ffae621c2..4d6dd4adaaaf 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -160,6 +160,7 @@ struct mptcp_addr_info { enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, + MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, MPTCP_PM_SUBFLOW_ESTABLISHED, @@ -168,6 +169,7 @@ enum mptcp_pm_status { enum mptcp_add_addr_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, + MPTCP_ADD_ADDR_IPV6, }; struct mptcp_pm_data { @@ -466,6 +468,7 @@ bool mptcp_schedule_work(struct sock *sk); void mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); +void __mptcp_flush_join_list(struct mptcp_sock *msk); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { return READ_ONCE(msk->snd_data_fin_enable) && @@ -506,6 +509,7 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk, void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); struct mptcp_pm_add_entry * @@ -528,6 +532,11 @@ static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } +static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); +} + static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.rm_addr_signal); @@ -552,6 +561,7 @@ void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); +void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -- cgit v1.2.3 From fa3fe2b150316b294f2c662653501273ff25bba8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 19 Nov 2020 11:46:02 -0800 Subject: mptcp: track window announced to peer OoO handling attempts to detect when packet is out-of-window by testing current ack sequence and remaining space vs. sequence number. This doesn't work reliably. Store the highest allowed sequence number that we've announced and use it to detect oow packets. Do this when mptcp options get written to the packet (wire format). For this to work we need to move the write_options call until after stack selected a new tcp window. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 3 ++- net/ipv4/tcp_output.c | 11 +++++++---- net/mptcp/options.c | 22 +++++++++++++++++++++- net/mptcp/protocol.c | 12 +++++++----- net/mptcp/protocol.h | 1 + 5 files changed, 38 insertions(+), 11 deletions(-) (limited to 'net/mptcp') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 6e706d838e4e..b6cf07143a8a 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -88,7 +88,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts); void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); +void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, + struct mptcp_out_options *opts); /* move the skb extension owership, with the assumption that 'to' is * newly allocated diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 99905bc01d40..41880d3521ed 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -445,11 +445,12 @@ struct tcp_out_options { struct mptcp_out_options mptcp; }; -static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp, + struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) - mptcp_write_options(ptr, &opts->mptcp); + mptcp_write_options(ptr, tp, &opts->mptcp); #endif } @@ -701,7 +702,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, smc_options_write(ptr, &options); - mptcp_options_write(ptr, opts); + mptcp_options_write(ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -1346,7 +1347,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts); skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); @@ -1357,6 +1357,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ th->window = htons(min(tp->rcv_wnd, 65535U)); } + + tcp_options_write((__be32 *)(th + 1), tp, &opts); + #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { diff --git a/net/mptcp/options.c b/net/mptcp/options.c index d7afffcc87c1..248e3930c0cb 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1010,7 +1010,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } } -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +static void mptcp_set_rwin(const struct tcp_sock *tp) +{ + const struct sock *ssk = (const struct sock *)tp; + const struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + + ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + + if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); +} + +void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, + struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { @@ -1167,4 +1184,7 @@ mp_capable_done: TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } } + + if (tp) + mptcp_set_rwin(tp); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8f303de7f500..e1606e072218 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -168,19 +168,19 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) struct rb_node **p, *parent; u64 seq, end_seq, max_seq; struct sk_buff *skb1; - int space; seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - space = tcp_space(sk); - max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq; + max_seq = READ_ONCE(msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); - if (after64(seq, max_seq)) { + if (after64(end_seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); - pr_debug("oow by %ld", (unsigned long)seq - (unsigned long)max_seq); + pr_debug("oow by %lld, rcv_wnd_sent %llu\n", + (unsigned long long)end_seq - (unsigned long)max_seq, + (unsigned long long)msk->rcv_wnd_sent); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -2295,6 +2295,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -2587,6 +2588,7 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); atomic64_set(&msk->snd_una, msk->write_seq); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4d6dd4adaaaf..67f86818203f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -216,6 +216,7 @@ struct mptcp_sock { u64 write_seq; u64 snd_nxt; u64 ack_seq; + u64 rcv_wnd_sent; u64 rcv_data_fin_seq; struct sock *last_snd; int snd_burst; -- cgit v1.2.3 From ea4ca586b16ff2eb6157fe13969eb72d2403a3a1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Nov 2020 11:46:03 -0800 Subject: mptcp: refine MPTCP-level ack scheduling Send timely MPTCP-level ack is somewhat difficult when the insertion into the msk receive level is performed by the worker. It needs TCP-level dup-ack to notify the MPTCP-level ack_seq increase, as both the TCP-level ack seq and the rcv window are unchanged. We can actually avoid processing incoming data with the worker, and let the subflow or recevmsg() send ack as needed. When recvmsg() moves the skbs inside the msk receive queue, the msk space is still unchanged, so tcp_cleanup_rbuf() could end-up skipping TCP-level ack generation. Anyway, when __mptcp_move_skbs() is invoked, a known amount of bytes is going to be consumed soon: we update rcv wnd computation taking them in account. Additionally we need to explicitly trigger tcp_cleanup_rbuf() when recvmsg() consumes a significant amount of the receive buffer. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 1 + net/mptcp/protocol.c | 105 +++++++++++++++++++++++++-------------------------- net/mptcp/protocol.h | 8 ++++ net/mptcp/subflow.c | 4 +- 4 files changed, 61 insertions(+), 57 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 248e3930c0cb..8a59b3e44599 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; + WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e1606e072218..4b7794835fea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -407,16 +407,42 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static void mptcp_send_ack(struct mptcp_sock *msk) +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} + +static void mptcp_send_ack(struct mptcp_sock *msk, bool force) { struct mptcp_subflow_context *subflow; + struct sock *pick = NULL; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); + if (force) { + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + continue; + } + + /* if the hintes ssk is still active, use it */ + pick = ssk; + if (ssk == msk->ack_hint) + break; + } + if (!force && pick) { + lock_sock(pick); + tcp_cleanup_rbuf(pick, 1); + release_sock(pick); } } @@ -468,7 +494,7 @@ static bool mptcp_check_data_fin(struct sock *sk) ret = true; mptcp_set_timeout(sk, NULL); - mptcp_send_ack(msk); + mptcp_send_ack(msk, true); mptcp_close_wake_up(sk); } return ret; @@ -483,7 +509,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; - u32 old_copied_seq; bool done = false; int sk_rbuf; @@ -500,7 +525,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); - old_copied_seq = tp->copied_seq; do { u32 map_remaining, offset; u32 seq = tp->copied_seq; @@ -564,11 +588,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, break; } } while (more_data_avail); + msk->ack_hint = ssk; *bytes += moved; - if (tp->copied_seq != old_copied_seq) - tcp_cleanup_rbuf(ssk, 1); - return done; } @@ -672,19 +694,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) goto wake; - if (move_skbs_to_msk(msk, ssk)) - goto wake; + move_skbs_to_msk(msk, ssk); - /* mptcp socket is owned, release_cb should retry */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, - &sk->sk_tsq_flags)) { - sock_hold(sk); - - /* need to try again, its possible release_cb() has already - * been called after the test_and_set_bit() above. - */ - move_skbs_to_msk(msk, ssk); - } wake: if (wake) sk->sk_data_ready(sk); @@ -1095,18 +1106,6 @@ static void mptcp_nospace(struct mptcp_sock *msk) mptcp_clean_una((struct sock *)msk); } -static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) - return false; - - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); -} - #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ @@ -1534,7 +1533,7 @@ new_measure: msk->rcvq_space.time = mstamp; } -static bool __mptcp_move_skbs(struct mptcp_sock *msk) +static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) { unsigned int moved = 0; bool done; @@ -1553,12 +1552,16 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) slowpath = lock_sock_fast(ssk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + if (moved && rcv) { + WRITE_ONCE(msk->rmem_pending, min(rcv, moved)); + tcp_cleanup_rbuf(ssk, 1); + WRITE_ONCE(msk->rmem_pending, 0); + } unlock_sock_fast(ssk, slowpath); } while (!done); if (mptcp_ofo_queue(msk) || moved > 0) { - if (!mptcp_check_data_fin((struct sock *)msk)) - mptcp_send_ack(msk); + mptcp_check_data_fin((struct sock *)msk); return true; } return false; @@ -1582,8 +1585,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); __mptcp_flush_join_list(msk); - while (len > (size_t)copied) { - int bytes_read; + for (;;) { + int bytes_read, old_space; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); if (unlikely(bytes_read < 0)) { @@ -1595,9 +1598,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; if (skb_queue_empty(&sk->sk_receive_queue) && - __mptcp_move_skbs(msk)) + __mptcp_move_skbs(msk, len - copied)) continue; + /* be sure to advertise window change */ + old_space = READ_ONCE(msk->old_wspace); + if ((tcp_space(sk) - old_space) >= old_space) + mptcp_send_ack(msk, false); + /* only the master socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ @@ -1650,7 +1658,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* .. race-breaker: ssk might have gotten new data * after last __mptcp_move_skbs() returned false. */ - if (unlikely(__mptcp_move_skbs(msk))) + if (unlikely(__mptcp_move_skbs(msk, 0))) set_bit(MPTCP_DATA_READY, &msk->flags); } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { /* data to read but mptcp_wait_data() cleared DATA_READY */ @@ -1881,7 +1889,6 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); - __mptcp_move_skbs(msk); if (mptcp_send_head(sk)) mptcp_push_pending(sk, 0); @@ -1965,6 +1972,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; + msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -2500,8 +2508,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; } -#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ - TCPF_WRITE_TIMER_DEFERRED) +#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) /* this is very alike tcp_release_cb() but we must handle differently a * different set of events @@ -2519,16 +2526,6 @@ static void mptcp_release_cb(struct sock *sk) sock_release_ownership(sk); - if (flags & TCPF_DELACK_TIMER_DEFERRED) { - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - ssk = mptcp_subflow_recv_lookup(msk); - if (!ssk || sk->sk_state == TCP_CLOSE || - !schedule_work(&msk->work)) - __sock_put(sk); - } - if (flags & TCPF_WRITE_TIMER_DEFERRED) { mptcp_retransmit_handler(sk); __sock_put(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 67f86818203f..82d5626323b1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -220,10 +220,12 @@ struct mptcp_sock { u64 rcv_data_fin_seq; struct sock *last_snd; int snd_burst; + int old_wspace; atomic64_t snd_una; atomic64_t wnd_end; unsigned long timer_ival; u32 token; + int rmem_pending; unsigned long flags; bool can_ack; bool fully_established; @@ -231,6 +233,7 @@ struct mptcp_sock { bool snd_data_fin_enable; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; + struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -258,6 +261,11 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +static inline int __mptcp_space(const struct sock *sk) +{ + return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending); +} + static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d3c6b3a5ad55..4d8abff1be18 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -850,8 +850,6 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; - if (incr) - tcp_cleanup_rbuf(ssk, incr); } static bool subflow_check_data_avail(struct sock *ssk) @@ -973,7 +971,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; - *space = tcp_space(sk); + *space = __mptcp_space(sk); *full_space = tcp_full_space(sk); } -- cgit v1.2.3 From b6d69fc8e8cfd1694e709c16e6192339bde68923 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Nov 2020 17:24:46 +0100 Subject: mptcp: put reference in mptcp timeout timer On close this timer might be scheduled. mptcp uses sk_reset_timer for this, so the a reference on the mptcp socket is taken. This causes a refcount leak which can for example be reproduced with 'mp_join_server_v4.pkt' from the mptcp-packetdrill repo. The leak has nothing to do with join requests, v1_mp_capable_bind_no_cs.pkt works too when replacing the last ack mpcapable to v1 instead of v0. unreferenced object 0xffff888109bba040 (size 2744): comm "packetdrill", [..] backtrace: [..] sk_prot_alloc.isra.0+0x2b/0xc0 [..] sk_clone_lock+0x2f/0x740 [..] mptcp_sk_clone+0x33/0x1a0 [..] subflow_syn_recv_sock+0x2b1/0x690 [..] Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Cc: Davide Caratti Signed-off-by: Florian Westphal Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20201124162446.11448-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4b7794835fea..dc979571f561 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1710,6 +1710,7 @@ static void mptcp_timeout_timer(struct timer_list *t) struct sock *sk = from_timer(sk, t, sk_timer); mptcp_schedule_work(sk); + sock_put(sk); } /* Find an idle subflow. Return NULL if there is unacked data at tcp -- cgit v1.2.3 From fd8976790a6c1fee98bd7bc73754ddc9f01273ab Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 24 Nov 2020 22:51:24 +0100 Subject: mptcp: be careful on MPTCP-level ack. We can enter the main mptcp_recvmsg() loop even when no subflows are connected. As note by Eric, that would result in a divide by zero oops on ack generation. Address the issue by checking the subflow status before sending the ack. Additionally protect mptcp_recvmsg() against invocation with weird socket states. v1 -> v2: - removed unneeded inline keyword - Jakub Reported-and-suggested-by: Eric Dumazet Fixes: ea4ca586b16f ("mptcp: refine MPTCP-level ack scheduling") Signed-off-by: Paolo Abeni Link: https://lore.kernel.org/r/5370c0ae03449239e3d1674ddcfb090cf6f20abe.1606253206.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 67 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 18 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index dc979571f561..16e9cb1c79cc 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -419,31 +419,57 @@ static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } -static void mptcp_send_ack(struct mptcp_sock *msk, bool force) +static bool tcp_can_send_ack(const struct sock *ssk) +{ + return !((1 << inet_sk_state_load(ssk)) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE)); +} + +static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct sock *pick = NULL; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (force) { - lock_sock(ssk); + lock_sock(ssk); + if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); - release_sock(ssk); - continue; - } - - /* if the hintes ssk is still active, use it */ - pick = ssk; - if (ssk == msk->ack_hint) - break; + release_sock(ssk); } - if (!force && pick) { - lock_sock(pick); - tcp_cleanup_rbuf(pick, 1); - release_sock(pick); +} + +static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +{ + int ret; + + lock_sock(ssk); + ret = tcp_can_send_ack(ssk); + if (ret) + tcp_cleanup_rbuf(ssk, 1); + release_sock(ssk); + return ret; +} + +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + /* if the hinted ssk is still active, try to use it */ + if (likely(msk->ack_hint)) { + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (msk->ack_hint == ssk && + mptcp_subflow_cleanup_rbuf(ssk)) + return; + } } + + /* otherwise pick the first active subflow */ + mptcp_for_each_subflow(msk, subflow) + if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) + return; } static bool mptcp_check_data_fin(struct sock *sk) @@ -494,7 +520,7 @@ static bool mptcp_check_data_fin(struct sock *sk) ret = true; mptcp_set_timeout(sk, NULL); - mptcp_send_ack(msk, true); + mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; @@ -1579,6 +1605,11 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return -EOPNOTSUPP; lock_sock(sk); + if (unlikely(sk->sk_state == TCP_LISTEN)) { + copied = -ENOTCONN; + goto out_err; + } + timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); @@ -1604,7 +1635,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* be sure to advertise window change */ old_space = READ_ONCE(msk->old_wspace); if ((tcp_space(sk) - old_space) >= old_space) - mptcp_send_ack(msk, false); + mptcp_cleanup_rbuf(msk); /* only the master socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() -- cgit v1.2.3 From ad80b0fc6e7f56bb1b09af86749ff3014477cfe6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Nov 2020 11:10:22 +0100 Subject: mptcp: open code mptcp variant for lock_sock This allows invoking an additional callback under the socket spin lock. Will be used by the next patches to avoid additional spin lock contention. Acked-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + net/core/sock.c | 2 +- net/mptcp/protocol.h | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/include/net/sock.h b/include/net/sock.h index 80469c2c448d..f59764614e30 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1590,6 +1590,7 @@ static inline void lock_sock(struct sock *sk) lock_sock_nested(sk, 0); } +void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 9badbe7bb4e4..f0f096852876 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2486,7 +2486,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -static void __lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 82d5626323b1..6abac8238de3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -253,6 +253,19 @@ struct mptcp_sock { } rcvq_space; }; +#define mptcp_lock_sock(___sk, cb) do { \ + struct sock *__sk = (___sk); /* silence macro reuse warning */ \ + might_sleep(); \ + spin_lock_bh(&__sk->sk_lock.slock); \ + if (__sk->sk_lock.owned) \ + __lock_sock(__sk); \ + cb; \ + __sk->sk_lock.owned = 1; \ + spin_unlock(&__sk->sk_lock.slock); \ + mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \ + local_bh_enable(); \ +} while (0) + #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) -- cgit v1.2.3 From e93da92896bc0ddc26e88bbc09e7e39b84366a38 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Nov 2020 11:10:23 +0100 Subject: mptcp: implement wmem reservation This leverages the previous commit to reserve the wmem required for the sendmsg() operation when the msk socket lock is first acquired. Some heuristics are used to get a reasonable [over] estimation of the whole memory required. If we can't forward alloc such amount fallback to a reasonable small chunk, otherwise enter the wait for memory path. When sendmsg() needs more memory it looks at wmem_reserved first and if that is exhausted, move more space from sk_forward_alloc. The reserved memory is not persistent and is released at the next socket unlock via the release_cb(). Overall this will simplify the next patch. Acked-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++---- net/mptcp/protocol.h | 1 + 2 files changed, 86 insertions(+), 7 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 16e9cb1c79cc..07fe484eefd1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -873,6 +873,81 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } +static int mptcp_wmem_with_overhead(int size) +{ + return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); +} + +static void __mptcp_wmem_reserve(struct sock *sk, int size) +{ + int amount = mptcp_wmem_with_overhead(size); + struct mptcp_sock *msk = mptcp_sk(sk); + + WARN_ON_ONCE(msk->wmem_reserved); + if (amount <= sk->sk_forward_alloc) + goto reserve; + + /* under memory pressure try to reserve at most a single page + * otherwise try to reserve the full estimate and fallback + * to a single page before entering the error path + */ + if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) || + !sk_wmem_schedule(sk, amount)) { + if (amount <= PAGE_SIZE) + goto nomem; + + amount = PAGE_SIZE; + if (!sk_wmem_schedule(sk, amount)) + goto nomem; + } + +reserve: + msk->wmem_reserved = amount; + sk->sk_forward_alloc -= amount; + return; + +nomem: + /* we will wait for memory on next allocation */ + msk->wmem_reserved = -1; +} + +static void __mptcp_update_wmem(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->wmem_reserved) + return; + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + if (msk->wmem_reserved > 0) { + sk->sk_forward_alloc += msk->wmem_reserved; + msk->wmem_reserved = 0; + } +} + +static bool mptcp_wmem_alloc(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* check for pre-existing error condition */ + if (msk->wmem_reserved < 0) + return false; + + if (msk->wmem_reserved >= size) + goto account; + + if (!sk_wmem_schedule(sk, size)) + return false; + + sk->sk_forward_alloc -= size; + msk->wmem_reserved += size; + +account: + msk->wmem_reserved -= size; + return true; +} + static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -930,7 +1005,7 @@ static void mptcp_clean_una(struct sock *sk) } out: - if (cleaned) + if (cleaned && tcp_under_memory_pressure(sk)) sk_mem_reclaim_partial(sk); } @@ -1307,7 +1382,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len)); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1356,11 +1431,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); - if (!sk_wmem_schedule(sk, psize + frag_truesize)) + if (!mptcp_wmem_alloc(sk, psize + frag_truesize)) goto wait_for_memory; if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { + msk->wmem_reserved += psize + frag_truesize; ret = -EFAULT; goto out; } @@ -1376,7 +1452,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) * Note: we charge such data both to sk and ssk */ sk_wmem_queued_add(sk, frag_truesize); - sk->sk_forward_alloc -= frag_truesize; if (!dfrag_collapsed) { get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); @@ -2003,6 +2078,7 @@ static int __mptcp_init_sock(struct sock *sk) INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; + msk->wmem_reserved = 0; msk->ack_hint = NULL; msk->first = NULL; @@ -2197,6 +2273,7 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); + WARN_ON_ONCE(msk->wmem_reserved); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); @@ -2542,13 +2619,14 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, #define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) -/* this is very alike tcp_release_cb() but we must handle differently a - * different set of events - */ +/* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) { unsigned long flags, nflags; + /* clear any wmem reservation and errors */ + __mptcp_update_wmem(sk); + do { flags = sk->sk_tsq_flags; if (!(flags & MPTCP_DEFERRED_ALL)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 6abac8238de3..4cf355076e35 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -218,6 +218,7 @@ struct mptcp_sock { u64 ack_seq; u64 rcv_wnd_sent; u64 rcv_data_fin_seq; + int wmem_reserved; struct sock *last_snd; int snd_burst; int old_wspace; -- cgit v1.2.3 From 879526030c8b5e8bd786a6408730893b9b2958ea Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Nov 2020 11:10:24 +0100 Subject: mptcp: protect the rx path with the msk socket spinlock Such spinlock is currently used only to protect the 'owned' flag inside the socket lock itself. With this patch, we extend its scope to protect the whole msk receive path and sk_forward_memory. Given the above, we can always move data into the msk receive queue (and OoO queue) from the subflow. We leverage the previous commit, so that we need to acquire the spinlock in the tx path only when moving fwd memory. recvmsg() must now explicitly acquire the socket spinlock when moving skbs out of sk_receive_queue. To reduce the number of lock operations required we use a second rx queue and splice the first into the latter in mptcp_lock_sock(). Additionally rmem allocated memory is bulk-freed via release_cb() Acked-by: Florian Westphal Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 149 +++++++++++++++++++++++++++++++++++---------------- net/mptcp/protocol.h | 5 ++ 2 files changed, 107 insertions(+), 47 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 07fe484eefd1..2f40882c4279 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -453,15 +453,15 @@ static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { + struct sock *ack_hint = READ_ONCE(msk->ack_hint); struct mptcp_subflow_context *subflow; /* if the hinted ssk is still active, try to use it */ - if (likely(msk->ack_hint)) { + if (likely(ack_hint)) { mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (msk->ack_hint == ssk && - mptcp_subflow_cleanup_rbuf(ssk)) + if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) return; } } @@ -614,13 +614,13 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, break; } } while (more_data_avail); - msk->ack_hint = ssk; + WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; return done; } -static bool mptcp_ofo_queue(struct mptcp_sock *msk) +static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; @@ -666,34 +666,27 @@ static bool mptcp_ofo_queue(struct mptcp_sock *msk) /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ -static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) +static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - if (READ_ONCE(sk->sk_lock.owned)) - return false; - - if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock))) - return false; - - /* must re-check after taking the lock */ - if (!READ_ONCE(sk->sk_lock.owned)) { - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - mptcp_ofo_queue(msk); + if (inet_sk_state_load(sk) == TCP_CLOSE) + return; - /* If the moves have caught up with the DATA_FIN sequence number - * it's time to ack the DATA_FIN and change socket state, but - * this is not a good place to change state. Let the workqueue - * do it. - */ - if (mptcp_pending_data_fin(sk, NULL)) - mptcp_schedule_work(sk); - } + mptcp_data_lock(sk); - spin_unlock_bh(&sk->sk_lock.slock); + __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + __mptcp_ofo_queue(msk); - return moved > 0; + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. + */ + if (mptcp_pending_data_fin(sk, NULL)) + mptcp_schedule_work(sk); + mptcp_data_unlock(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) @@ -937,17 +930,30 @@ static bool mptcp_wmem_alloc(struct sock *sk, int size) if (msk->wmem_reserved >= size) goto account; - if (!sk_wmem_schedule(sk, size)) + mptcp_data_lock(sk); + if (!sk_wmem_schedule(sk, size)) { + mptcp_data_unlock(sk); return false; + } sk->sk_forward_alloc -= size; msk->wmem_reserved += size; + mptcp_data_unlock(sk); account: msk->wmem_reserved -= size; return true; } +static void mptcp_wmem_uncharge(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + msk->wmem_reserved += size; +} + static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -976,6 +982,7 @@ static void mptcp_clean_una(struct sock *sk) if (__mptcp_check_fallback(msk)) atomic64_set(&msk->snd_una, msk->snd_nxt); + mptcp_data_lock(sk); snd_una = atomic64_read(&msk->snd_una); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { @@ -1007,6 +1014,7 @@ static void mptcp_clean_una(struct sock *sk) out: if (cleaned && tcp_under_memory_pressure(sk)) sk_mem_reclaim_partial(sk); + mptcp_data_unlock(sk); } static void mptcp_clean_una_wakeup(struct sock *sk) @@ -1436,7 +1444,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { - msk->wmem_reserved += psize + frag_truesize; + mptcp_wmem_uncharge(sk, psize + frag_truesize); ret = -EFAULT; goto out; } @@ -1502,11 +1510,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len) { - struct sock *sk = (struct sock *)msk; struct sk_buff *skb; int copied = 0; - while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + while ((skb = skb_peek(&msk->receive_queue)) != NULL) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); @@ -1526,7 +1533,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } - __skb_unlink(skb, &sk->sk_receive_queue); + /* we will bulk release the skb memory later */ + skb->destructor = NULL; + msk->rmem_released += skb->truesize; + __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); if (copied >= len) @@ -1634,25 +1644,47 @@ new_measure: msk->rcvq_space.time = mstamp; } +static void __mptcp_update_rmem(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->rmem_released) + return; + + atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, msk->rmem_released); + msk->rmem_released = 0; +} + +static void __mptcp_splice_receive_queue(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); +} + static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) { + struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - bool done; - - /* avoid looping forever below on racing close */ - if (((struct sock *)msk)->sk_state == TCP_CLOSE) - return false; + bool ret, done; __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; - if (!ssk) + /* we can have data pending in the subflows only if the msk + * receive buffer was full at subflow_data_ready() time, + * that is an unlikely slow path. + */ + if (likely(!ssk)) break; slowpath = lock_sock_fast(ssk); + mptcp_data_lock(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + mptcp_data_unlock(sk); if (moved && rcv) { WRITE_ONCE(msk->rmem_pending, min(rcv, moved)); tcp_cleanup_rbuf(ssk, 1); @@ -1661,11 +1693,19 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) unlock_sock_fast(ssk, slowpath); } while (!done); - if (mptcp_ofo_queue(msk) || moved > 0) { - mptcp_check_data_fin((struct sock *)msk); - return true; + /* acquire the data lock only if some input data is pending */ + ret = moved > 0; + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || + !skb_queue_empty_lockless(&sk->sk_receive_queue)) { + mptcp_data_lock(sk); + __mptcp_update_rmem(sk); + ret |= __mptcp_ofo_queue(msk); + __mptcp_splice_receive_queue(sk); + mptcp_data_unlock(sk); } - return false; + if (ret) + mptcp_check_data_fin((struct sock *)msk); + return !skb_queue_empty(&msk->receive_queue); } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1679,7 +1719,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; @@ -1689,7 +1729,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - __mptcp_flush_join_list(msk); for (;;) { int bytes_read, old_space; @@ -1703,7 +1742,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&sk->sk_receive_queue) && + if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk, len - copied)) continue; @@ -1734,8 +1773,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* race breaker: the shutdown could be after the + * previous receive queue check + */ + if (__mptcp_move_skbs(msk, len - copied)) + continue; break; + } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; @@ -1757,7 +1802,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, mptcp_wait_data(sk, &timeo); } - if (skb_queue_empty(&sk->sk_receive_queue)) { + if (skb_queue_empty_lockless(&sk->sk_receive_queue) && + skb_queue_empty(&msk->receive_queue)) { /* entire backlog drained, clear DATA_READY. */ clear_bit(MPTCP_DATA_READY, &msk->flags); @@ -1773,7 +1819,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, out_err: pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty(&sk->sk_receive_queue), copied); + skb_queue_empty_lockless(&sk->sk_receive_queue), copied); mptcp_rcv_space_adjust(msk, copied); release_sock(sk); @@ -2076,9 +2122,11 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); + __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->wmem_reserved = 0; + msk->rmem_released = 0; msk->ack_hint = NULL; msk->first = NULL; @@ -2274,6 +2322,7 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); WARN_ON_ONCE(msk->wmem_reserved); + WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); @@ -2491,6 +2540,11 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, void mptcp_destroy_common(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; + + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ + skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); + skb_rbtree_purge(&msk->out_of_order_queue); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); @@ -2626,6 +2680,7 @@ static void mptcp_release_cb(struct sock *sk) /* clear any wmem reservation and errors */ __mptcp_update_wmem(sk); + __mptcp_update_rmem(sk); do { flags = sk->sk_tsq_flags; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4cf355076e35..fe2efd923c5c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -227,6 +227,7 @@ struct mptcp_sock { unsigned long timer_ival; u32 token; int rmem_pending; + int rmem_released; unsigned long flags; bool can_ack; bool fully_established; @@ -238,6 +239,7 @@ struct mptcp_sock { struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; + struct sk_buff_head receive_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -267,6 +269,9 @@ struct mptcp_sock { local_bh_enable(); \ } while (0) +#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) +#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) + #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) -- cgit v1.2.3 From 724cfd2ee8aa12e933253bb7a8bccb743a6fa6ef Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Nov 2020 11:10:25 +0100 Subject: mptcp: allocate TX skbs in msk context Move the TX skbs allocation in mptcp_sendmsg() scope, and tentatively pre-allocate a skbs number proportional to the sendmsg() length. Use the ssk tx skb cache to prevent the subflow allocation. This allows removing the msk skb extension cache and will make possible the later patches. Acked-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 248 ++++++++++++++++++++++++++++++++++++++++++--------- net/mptcp/protocol.h | 4 +- 2 files changed, 210 insertions(+), 42 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2f40882c4279..75b4c4c50dbb 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -818,16 +818,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) mptcp_close_wake_up(sk); } -static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) -{ - const struct sock *sk = (const struct sock *)msk; - - if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); - - return !!msk->cached_ext; -} - static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -866,14 +856,22 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static int mptcp_wmem_with_overhead(int size) +static int mptcp_wmem_with_overhead(struct sock *sk, int size) { - return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); + struct mptcp_sock *msk = mptcp_sk(sk); + int ret, skbs; + + ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); + skbs = (msk->tx_pending_data + size) / msk->size_goal_cache; + if (skbs < msk->skb_tx_cache.qlen) + return ret; + + return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER); } static void __mptcp_wmem_reserve(struct sock *sk, int size) { - int amount = mptcp_wmem_with_overhead(size); + int amount = mptcp_wmem_with_overhead(sk, size); struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); @@ -954,6 +952,25 @@ static void mptcp_wmem_uncharge(struct sock *sk, int size) msk->wmem_reserved += size; } +static void mptcp_mem_reclaim_partial(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* if we are experiencing a transint allocation error, + * the forward allocation memory has been already + * released + */ + if (msk->wmem_reserved < 0) + return; + + mptcp_data_lock(sk); + sk->sk_forward_alloc += msk->wmem_reserved; + sk_mem_reclaim_partial(sk); + msk->wmem_reserved = sk->sk_forward_alloc; + sk->sk_forward_alloc = 0; + mptcp_data_unlock(sk); +} + static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -1030,19 +1047,12 @@ static void mptcp_clean_una_wakeup(struct sock *sk) } } -/* ensure we get enough memory for the frag hdr, beyond some minimal amount of - * data - */ -static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +static void mptcp_enter_memory_pressure(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool first = true; - if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), - pfrag, sk->sk_allocation))) - return true; - sk_stream_moderate_sndbuf(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -1052,6 +1062,18 @@ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) sk_stream_moderate_sndbuf(ssk); first = false; } +} + +/* ensure we get enough memory for the frag hdr, beyond some minimal amount of + * data + */ +static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), + pfrag, sk->sk_allocation))) + return true; + + mptcp_enter_memory_pressure(sk); return false; } @@ -1098,6 +1120,128 @@ static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, return avail_size; } +static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) +{ + struct skb_ext *mpext = __skb_ext_alloc(gfp); + + if (!mpext) + return false; + __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); + return true; +} + +static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk) +{ + struct sk_buff *skb; + + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (likely(skb)) { + if (likely(__mptcp_add_ext(skb, sk->sk_allocation))) { + skb_reserve(skb, MAX_TCP_HEADER); + skb->reserved_tailroom = skb->end - skb->tail; + return skb; + } + __kfree_skb(skb); + } else { + mptcp_enter_memory_pressure(sk); + } + return NULL; +} + +static bool mptcp_tx_cache_refill(struct sock *sk, int size, + struct sk_buff_head *skbs, int *total_ts) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + int space_needed; + + if (unlikely(tcp_under_memory_pressure(sk))) { + mptcp_mem_reclaim_partial(sk); + + /* under pressure pre-allocate at most a single skb */ + if (msk->skb_tx_cache.qlen) + return true; + space_needed = msk->size_goal_cache; + } else { + space_needed = msk->tx_pending_data + size - + msk->skb_tx_cache.qlen * msk->size_goal_cache; + } + + while (space_needed > 0) { + skb = __mptcp_do_alloc_tx_skb(sk); + if (unlikely(!skb)) { + /* under memory pressure, try to pass the caller a + * single skb to allow forward progress + */ + while (skbs->qlen > 1) { + skb = __skb_dequeue_tail(skbs); + __kfree_skb(skb); + } + return skbs->qlen > 0; + } + + *total_ts += skb->truesize; + __skb_queue_tail(skbs, skb); + space_needed -= msk->size_goal_cache; + } + return true; +} + +static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + + if (ssk->sk_tx_skb_cache) { + skb = ssk->sk_tx_skb_cache; + if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && + !__mptcp_add_ext(skb, sk->sk_allocation))) + return false; + return true; + } + + skb = skb_peek(&msk->skb_tx_cache); + if (skb) { + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + skb = __skb_dequeue(&msk->skb_tx_cache); + if (WARN_ON_ONCE(!skb)) + return false; + + mptcp_wmem_uncharge(sk, skb->truesize); + ssk->sk_tx_skb_cache = skb; + return true; + } + + /* over memory limit, no point to try to allocate a new skb */ + return false; + } + + skb = __mptcp_do_alloc_tx_skb(sk); + if (!skb) + return false; + + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + ssk->sk_tx_skb_cache = skb; + return true; + } + kfree_skb(skb); + return false; +} + +static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) +{ + return !ssk->sk_tx_skb_cache && + !skb_peek(&mptcp_sk(sk)->skb_tx_cache) && + tcp_under_memory_pressure(sk); +} + +static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) +{ + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) + mptcp_mem_reclaim_partial(sk); + return __mptcp_alloc_tx_skb(sk, ssk); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1109,7 +1253,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct sk_buff *skb, *tail; bool can_collapse = false; int avail_size; - size_t ret; + size_t ret = 0; pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); @@ -1117,6 +1261,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); avail_size = info->size_goal; + msk->size_goal_cache = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { /* Limit the write to the size available in the @@ -1165,8 +1310,11 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, goto out; } - mpext = __skb_ext_set(tail, SKB_EXT_MPTCP, msk->cached_ext); - msk->cached_ext = NULL; + mpext = skb_ext_find(tail, SKB_EXT_MPTCP); + if (WARN_ON_ONCE(!mpext)) { + /* should never reach here, stream corrupted */ + return -EINVAL; + } memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; @@ -1239,9 +1387,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, sock_owned_by_me((struct sock *)msk); *sndbuf = 0; - if (!mptcp_ext_cache_refill(msk)) - return NULL; - if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; @@ -1350,6 +1495,15 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags) if (ssk != prev_ssk || !prev_ssk) lock_sock(ssk); + /* keep it simple and always provide a new skb for the + * subflow, even if we will not use it when collapsing + * on the pending one + */ + if (!mptcp_alloc_tx_skb(sk, ssk)) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { mptcp_push_release(sk, ssk, &info); @@ -1360,6 +1514,7 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags) dfrag->already_sent += ret; msk->snd_nxt += ret; msk->snd_burst -= ret; + msk->tx_pending_data -= ret; copied += ret; len -= ret; } @@ -1404,8 +1559,9 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) mptcp_clean_una(sk); while (msg_data_left(msg)) { + int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; - int frag_truesize = 0; + struct sk_buff_head skbs; bool dfrag_collapsed; size_t psize, offset; @@ -1439,9 +1595,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); - if (!mptcp_wmem_alloc(sk, psize + frag_truesize)) + total_ts = psize + frag_truesize; + __skb_queue_head_init(&skbs); + if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts)) goto wait_for_memory; + if (!mptcp_wmem_alloc(sk, total_ts)) { + __skb_queue_purge(&skbs); + goto wait_for_memory; + } + + skb_queue_splice_tail(&skbs, &msk->skb_tx_cache); if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { mptcp_wmem_uncharge(sk, psize + frag_truesize); @@ -1470,8 +1634,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); - if (!mptcp_ext_cache_refill(msk)) - goto wait_for_memory; continue; wait_for_memory: @@ -1483,8 +1645,10 @@ wait_for_memory: goto out; } - if (copied) + if (copied) { + msk->tx_pending_data += copied; mptcp_push_pending(sk, msg->msg_flags); + } out: release_sock(sk); @@ -2072,9 +2236,6 @@ static void mptcp_worker(struct work_struct *work) if (!dfrag) goto unlock; - if (!mptcp_ext_cache_refill(msk)) - goto reset_unlock; - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; @@ -2085,6 +2246,9 @@ static void mptcp_worker(struct work_struct *work) info.sent = 0; info.limit = dfrag->already_sent; while (info.sent < dfrag->already_sent) { + if (!mptcp_alloc_tx_skb(sk, ssk)) + break; + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; @@ -2092,9 +2256,6 @@ static void mptcp_worker(struct work_struct *work) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; info.sent += ret; - - if (!mptcp_ext_cache_refill(msk)) - break; } if (copied) tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, @@ -2123,10 +2284,13 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); __skb_queue_head_init(&msk->receive_queue); + __skb_queue_head_init(&msk->skb_tx_cache); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->wmem_reserved = 0; msk->rmem_released = 0; + msk->tx_pending_data = 0; + msk->size_goal_cache = TCP_BASE_MSS; msk->ack_hint = NULL; msk->first = NULL; @@ -2170,12 +2334,17 @@ static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; + struct sk_buff *skb; sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); + while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) { + sk->sk_forward_alloc += skb->truesize; + kfree_skb(skb); + } } static void mptcp_cancel_work(struct sock *sk) @@ -2554,9 +2723,6 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (msk->cached_ext) - __skb_ext_put(msk->cached_ext); - mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index fe2efd923c5c..97c1e5dcb3e2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -240,11 +240,13 @@ struct mptcp_sock { struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; + struct sk_buff_head skb_tx_cache; /* this is wmem accounted */ + int tx_pending_data; + int size_goal_cache; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; - struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; -- cgit v1.2.3 From 7439d687b79cbbd971c6a170be9aefda4a564be4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Nov 2020 11:10:26 +0100 Subject: mptcp: avoid a few atomic ops in the rx path Extending the data_lock scope in mptcp_incoming_option we can use that to protect both snd_una and wnd_end. In the typical case, we will have a single atomic op instead of 2 Acked-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/mptcp_diag.c | 2 +- net/mptcp/options.c | 33 +++++++++++++-------------------- net/mptcp/protocol.c | 34 ++++++++++++++++------------------ net/mptcp/protocol.h | 8 ++++---- 4 files changed, 34 insertions(+), 43 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 5f390a97f556..b70ae4ba3000 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -140,7 +140,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_flags = flags; info->mptcpi_token = READ_ONCE(msk->token); info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8a59b3e44599..3986454a0340 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -833,15 +833,17 @@ static void ack_update_msk(struct mptcp_sock *msk, const struct sock *ssk, struct mptcp_options_received *mp_opt) { - u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); - u64 new_wnd_end, wnd_end, old_wnd_end = atomic64_read(&msk->wnd_end); - u64 snd_nxt = READ_ONCE(msk->snd_nxt); + u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); struct sock *sk = (struct sock *)msk; + u64 old_snd_una; + + mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ + old_snd_una = msk->snd_una; new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ @@ -850,26 +852,17 @@ static void ack_update_msk(struct mptcp_sock *msk, new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; - while (after64(new_wnd_end, old_wnd_end)) { - wnd_end = old_wnd_end; - old_wnd_end = atomic64_cmpxchg(&msk->wnd_end, wnd_end, - new_wnd_end); - if (old_wnd_end == wnd_end) { - if (mptcp_send_head(sk)) - mptcp_schedule_work(sk); - break; - } + if (after64(new_wnd_end, msk->wnd_end)) { + msk->wnd_end = new_wnd_end; + if (mptcp_send_head(sk)) + mptcp_schedule_work(sk); } - while (after64(new_snd_una, old_snd_una)) { - snd_una = old_snd_una; - old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, - new_snd_una); - if (old_snd_una == snd_una) { - mptcp_data_acked(sk); - break; - } + if (after64(new_snd_una, old_snd_una)) { + msk->snd_una = new_snd_una; + __mptcp_data_acked(sk); } + mptcp_data_unlock(sk); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 75b4c4c50dbb..51f92f3096bf 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -60,7 +60,7 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) { - return atomic64_read(&msk->wnd_end); + return READ_ONCE(msk->wnd_end); } static bool mptcp_is_tcpsk(struct sock *sk) @@ -358,7 +358,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk) /* Look for an acknowledged DATA_FIN */ if (((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && - msk->write_seq == atomic64_read(&msk->snd_una)) { + msk->write_seq == READ_ONCE(msk->snd_una)) { mptcp_stop_timer(sk); WRITE_ONCE(msk->snd_data_fin_enable, 0); @@ -764,7 +764,7 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -void mptcp_data_acked(struct sock *sk) +void __mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); @@ -997,11 +997,11 @@ static void mptcp_clean_una(struct sock *sk) * plain TCP */ if (__mptcp_check_fallback(msk)) - atomic64_set(&msk->snd_una, msk->snd_nxt); + msk->snd_una = READ_ONCE(msk->snd_nxt); - mptcp_data_lock(sk); - snd_una = atomic64_read(&msk->snd_una); + mptcp_data_lock(sk); + snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; @@ -1282,10 +1282,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* Zero window and all data acked? Probe. */ avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); if (avail_size == 0) { - if (skb || atomic64_read(&msk->snd_una) != msk->snd_nxt) + u64 snd_una = READ_ONCE(msk->snd_una); + + if (skb || snd_una != msk->snd_nxt) return 0; zero_window_probe = true; - data_seq = atomic64_read(&msk->snd_una) - 1; + data_seq = snd_una - 1; avail_size = 1; } @@ -1994,12 +1996,8 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->snd_nxt)) { - mptcp_stop_timer(sk); - } else { - set_bit(MPTCP_WORK_RTX, &msk->flags); - mptcp_schedule_work(sk); - } + set_bit(MPTCP_WORK_RTX, &msk->flags); + mptcp_schedule_work(sk); } static void mptcp_retransmit_timer(struct timer_list *t) @@ -2621,8 +2619,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; - atomic64_set(&msk->snd_una, msk->write_seq); - atomic64_set(&msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd); + msk->snd_una = msk->write_seq; + msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; if (mp_opt->mp_capable) { msk->can_ack = true; @@ -2658,7 +2656,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; - atomic64_set(&msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); + WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, @@ -2918,7 +2916,7 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); - atomic64_set(&msk->snd_una, msk->write_seq); + WRITE_ONCE(msk->snd_una, msk->write_seq); mptcp_pm_new_connection(msk, 0); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 97c1e5dcb3e2..3c07aafde10e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -222,8 +222,8 @@ struct mptcp_sock { struct sock *last_snd; int snd_burst; int old_wspace; - atomic64_t snd_una; - atomic64_t wnd_end; + u64 snd_una; + u64 wnd_end; unsigned long timer_ival; u32 token; int rmem_pending; @@ -321,7 +321,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (!before64(msk->snd_nxt, atomic64_read(&msk->snd_una))) + if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); @@ -495,7 +495,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); -void mptcp_data_acked(struct sock *sk); +void __mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); void __mptcp_flush_join_list(struct mptcp_sock *msk); -- cgit v1.2.3 From 6e628cd3a8f78cb0dfe85353e5e488bda296bedf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Nov 2020 11:10:27 +0100 Subject: mptcp: use mptcp release_cb for delayed tasks We have some tasks triggered by the subflow receive path which require to access the msk socket status, specifically: mptcp_clean_una() and mptcp_push_pending() We have almost everything in place to defer to the msk release_cb such tasks when the msk sock is owned. Since the worker is no more used to clean the acked data, for fallback sockets we need to explicitly flush them. As an added bonus we can move the wake-up code in __mptcp_clean_una(), simplify a lot mptcp_poll() and move the timer update under the data lock. The worker is now used only to process and send DATA_FIN packets and do the mptcp-level retransmissions. Acked-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 18 +++- net/mptcp/protocol.c | 250 ++++++++++++++++++++++++++++++--------------------- net/mptcp/protocol.h | 3 + net/mptcp/subflow.c | 14 +-- 4 files changed, 168 insertions(+), 117 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 3986454a0340..6b7b4b67f18c 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -830,7 +830,7 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) } static void ack_update_msk(struct mptcp_sock *msk, - const struct sock *ssk, + struct sock *ssk, struct mptcp_options_received *mp_opt) { u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); @@ -854,8 +854,7 @@ static void ack_update_msk(struct mptcp_sock *msk, if (after64(new_wnd_end, msk->wnd_end)) { msk->wnd_end = new_wnd_end; - if (mptcp_send_head(sk)) - mptcp_schedule_work(sk); + __mptcp_wnd_updated(sk, ssk); } if (after64(new_snd_una, old_snd_una)) { @@ -915,8 +914,19 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; - if (__mptcp_check_fallback(msk)) + if (__mptcp_check_fallback(msk)) { + /* Keep it simple and unconditionally trigger send data cleanup and + * pending queue spooling. We will need to acquire the data lock + * for more accurate checks, and once the lock is acquired, such + * helpers are cheap. + */ + mptcp_data_lock(subflow->conn); + if (mptcp_send_head(subflow->conn)) + __mptcp_wnd_updated(subflow->conn, sk); + __mptcp_data_acked(subflow->conn); + mptcp_data_unlock(subflow->conn); return; + } mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 51f92f3096bf..221f7cdd416b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -348,17 +348,22 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } -static void mptcp_check_data_fin_ack(struct sock *sk) +static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (__mptcp_check_fallback(msk)) - return; + return !__mptcp_check_fallback(msk) && + ((1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && + msk->write_seq == READ_ONCE(msk->snd_una); +} + +static void mptcp_check_data_fin_ack(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ - if (((1 << sk->sk_state) & - (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && - msk->write_seq == READ_ONCE(msk->snd_una)) { + if (mptcp_pending_data_fin_ack(sk)) { mptcp_stop_timer(sk); WRITE_ONCE(msk->snd_data_fin_enable, 0); @@ -764,16 +769,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -void __mptcp_data_acked(struct sock *sk) -{ - mptcp_reset_timer(sk); - - if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) || - mptcp_send_head(sk) || - (inet_sk_state_load(sk) != TCP_ESTABLISHED))) - mptcp_schedule_work(sk); -} - void mptcp_subflow_eof(struct sock *sk) { if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) @@ -986,7 +981,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } -static void mptcp_clean_una(struct sock *sk) +static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; @@ -999,8 +994,6 @@ static void mptcp_clean_una(struct sock *sk) if (__mptcp_check_fallback(msk)) msk->snd_una = READ_ONCE(msk->snd_nxt); - - mptcp_data_lock(sk); snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) @@ -1029,21 +1022,25 @@ static void mptcp_clean_una(struct sock *sk) } out: - if (cleaned && tcp_under_memory_pressure(sk)) - sk_mem_reclaim_partial(sk); - mptcp_data_unlock(sk); -} - -static void mptcp_clean_una_wakeup(struct sock *sk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); + if (cleaned) { + if (tcp_under_memory_pressure(sk)) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } - mptcp_clean_una(sk); + if (sk_stream_is_writeable(sk)) { + /* pairs with memory barrier in mptcp_poll */ + smp_mb(); + if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags)) + sk_stream_write_space(sk); + } + } - /* Only wake up writers if a subflow is ready */ - if (sk_stream_is_writeable(sk)) { - clear_bit(MPTCP_NOSPACE, &msk->flags); - sk_stream_write_space(sk); + if (snd_una == READ_ONCE(msk->snd_nxt)) { + if (msk->timer_ival) + mptcp_stop_timer(sk); + } else { + mptcp_reset_timer(sk); } } @@ -1130,13 +1127,13 @@ static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) return true; } -static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk) +static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) { struct sk_buff *skb; - skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { - if (likely(__mptcp_add_ext(skb, sk->sk_allocation))) { + if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); skb->reserved_tailroom = skb->end - skb->tail; return skb; @@ -1168,7 +1165,7 @@ static bool mptcp_tx_cache_refill(struct sock *sk, int size, } while (space_needed > 0) { - skb = __mptcp_do_alloc_tx_skb(sk); + skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation); if (unlikely(!skb)) { /* under memory pressure, try to pass the caller a * single skb to allow forward progress @@ -1187,7 +1184,7 @@ static bool mptcp_tx_cache_refill(struct sock *sk, int size, return true; } -static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) +static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb; @@ -1195,7 +1192,7 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) if (ssk->sk_tx_skb_cache) { skb = ssk->sk_tx_skb_cache; if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && - !__mptcp_add_ext(skb, sk->sk_allocation))) + !__mptcp_add_ext(skb, gfp))) return false; return true; } @@ -1216,7 +1213,7 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) return false; } - skb = __mptcp_do_alloc_tx_skb(sk); + skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return false; @@ -1239,7 +1236,7 @@ static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) { if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) mptcp_mem_reclaim_partial(sk); - return __mptcp_alloc_tx_skb(sk, ssk); + return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, @@ -1340,31 +1337,6 @@ out: return ret; } -static void mptcp_nospace(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - set_bit(MPTCP_NOSPACE, &msk->flags); - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool ssk_writeable = sk_stream_is_writeable(ssk); - struct socket *sock = READ_ONCE(ssk->sk_socket); - - if (ssk_writeable || !sock) - continue; - - /* enables ssk->write_space() callbacks */ - set_bit(SOCK_NOSPACE, &sock->flags); - } - - /* mptcp_data_acked() could run just before we set the NOSPACE bit, - * so explicitly check for snd_una value - */ - mptcp_clean_una((struct sock *)msk); -} - #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ @@ -1536,6 +1508,63 @@ out: } } +static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + + info.flags = 0; + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } + if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC)) + goto out; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + goto out; + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + msk->tx_pending_data -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + +out: + /* __mptcp_alloc_tx_skb could have released some wmem and we are + * not going to flush it via release_sock() + */ + __mptcp_update_wmem(sk); + if (copied) { + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + if (msk->snd_data_fin_enable && + msk->snd_nxt + 1 == msk->write_seq) + mptcp_schedule_work(sk); + } +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1558,7 +1587,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } pfrag = sk_page_frag(sk); - mptcp_clean_una(sk); while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; @@ -1578,11 +1606,9 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) dfrag = mptcp_pending_tail(sk); dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { - if (!sk_stream_memory_free(sk)) { - mptcp_push_pending(sk, msg->msg_flags); - if (!sk_stream_memory_free(sk)) - goto wait_for_memory; - } + if (!sk_stream_memory_free(sk)) + goto wait_for_memory; + if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory; @@ -1639,9 +1665,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) continue; wait_for_memory: - mptcp_nospace(msk); - if (mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + set_bit(MPTCP_NOSPACE, &msk->flags); + mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; @@ -2198,21 +2223,18 @@ static void mptcp_worker(struct work_struct *work) if (unlikely(state == TCP_CLOSE)) goto unlock; - mptcp_clean_una_wakeup(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); - if (mptcp_send_head(sk)) - mptcp_push_pending(sk, 0); - if (msk->pm.status) pm_work(msk); if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); + __mptcp_check_send_data_fin(sk); mptcp_check_data_fin(sk); /* if the msk data is completely acked, or the socket timedout, @@ -2334,8 +2356,6 @@ static void __mptcp_clear_xmit(struct sock *sk) struct mptcp_data_frag *dtmp, *dfrag; struct sk_buff *skb; - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); - WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); @@ -2477,7 +2497,7 @@ static void __mptcp_destroy_sock(struct sock *sk) spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - __mptcp_clear_xmit(sk); + sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; @@ -2709,6 +2729,8 @@ void mptcp_destroy_common(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; + __mptcp_clear_xmit(sk); + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); @@ -2835,6 +2857,28 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; } +void __mptcp_data_acked(struct sock *sk) +{ + if (!sock_owned_by_user(sk)) + __mptcp_clean_una(sk); + else + set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); + + if (mptcp_pending_data_fin_ack(sk)) + mptcp_schedule_work(sk); +} + +void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk) +{ + if (!mptcp_send_head(sk)) + return; + + if (!sock_owned_by_user(sk)) + __mptcp_subflow_push_pending(sk, ssk); + else + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +} + #define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) /* processes deferred events and flush wmem */ @@ -2842,6 +2886,25 @@ static void mptcp_release_cb(struct sock *sk) { unsigned long flags, nflags; + /* push_pending may touch wmem_reserved, do it before the later + * cleanup + */ + if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) + __mptcp_clean_una(sk); + if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) { + /* mptcp_push_pending() acquires the subflow socket lock + * + * 1) can't be invoked in atomic scope + * 2) must avoid ABBA deadlock with msk socket spinlock: the RX + * datapath acquires the msk socket spinlock while helding + * the subflow socket lock + */ + + spin_unlock_bh(&sk->sk_lock.slock); + mptcp_push_pending(sk, 0); + spin_lock_bh(&sk->sk_lock.slock); + } + /* clear any wmem reservation and errors */ __mptcp_update_wmem(sk); __mptcp_update_rmem(sk); @@ -3177,24 +3240,9 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 0; } -static bool __mptcp_check_writeable(struct mptcp_sock *msk) -{ - struct sock *sk = (struct sock *)msk; - bool mptcp_writable; - - mptcp_clean_una(sk); - mptcp_writable = sk_stream_is_writeable(sk); - if (!mptcp_writable) - mptcp_nospace(msk); - - return mptcp_writable; -} - static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; - __poll_t ret = 0; - bool slow; if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) return 0; @@ -3202,12 +3250,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM; - slow = lock_sock_fast(sk); - if (__mptcp_check_writeable(msk)) - ret = EPOLLOUT | EPOLLWRNORM; + set_bit(MPTCP_NOSPACE, &msk->flags); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; - unlock_sock_fast(sk, slow); - return ret; + return 0; } static __poll_t mptcp_poll(struct file *file, struct socket *sock, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3c07aafde10e..fc56e730fb35 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -91,6 +91,8 @@ #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 +#define MPTCP_PUSH_PENDING 6 +#define MPTCP_CLEAN_UNA 7 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -495,6 +497,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); +void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 2e5c3f4da3a4..023c856424b7 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -995,19 +995,9 @@ static void subflow_data_ready(struct sock *sk) mptcp_data_ready(parent, sk); } -static void subflow_write_space(struct sock *sk) +static void subflow_write_space(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct socket *sock = READ_ONCE(sk->sk_socket); - struct sock *parent = subflow->conn; - - if (!sk_stream_is_writeable(sk)) - return; - - if (sock && sk_stream_is_writeable(parent)) - clear_bit(SOCK_NOSPACE, &sock->flags); - - sk_stream_write_space(parent); + /* we take action in __mptcp_clean_una() */ } static struct inet_connection_sock_af_ops * -- cgit v1.2.3 From 05e3ecea4a6305597a060da0a123c80df8827bf1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Dec 2020 09:16:57 -0800 Subject: mptcp: avoid potential infinite loop in mptcp_recvmsg() If a packet is ready in receive queue, and application isssues a recvmsg()/recvfrom()/recvmmsg() request asking for zero bytes, we hang in mptcp_recvmsg(). Fixes: ea4ca586b16f ("mptcp: refine MPTCP-level ack scheduling") Signed-off-by: Eric Dumazet Tested-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20201202171657.1185108-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 221f7cdd416b..57213ff60f78 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1921,7 +1921,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - for (;;) { + while (copied < len) { int bytes_read, old_space; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); -- cgit v1.2.3 From 7ea851d19b23593d7601ecb8091d529f4f475bf5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 30 Nov 2020 16:36:30 +0100 Subject: tcp: merge 'init_req' and 'route_req' functions The Multipath-TCP standard (RFC 8684) says that an MPTCP host should send a TCP reset if the token in a MP_JOIN request is unknown. At this time we don't do this, the 3whs completes and the 'new subflow' is reset afterwards. There are two ways to allow MPTCP to send the reset. 1. override 'send_synack' callback and emit the rst from there. The drawback is that the request socket gets inserted into the listeners queue just to get removed again right away. 2. Send the reset from the 'route_req' function instead. This avoids the 'add&remove request socket', but route_req lacks the skb that is required to send the TCP reset. Instead of just adding the skb to that function for MPTCP sake alone, Paolo suggested to merge init_req and route_req functions. This saves one indirection from syn processing path and provides the skb to the merged function at the same time. 'send reset on unknown mptcp join token' is added in next patch. Suggested-by: Paolo Abeni Cc: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 9 ++++----- net/ipv4/tcp_input.c | 9 ++------- net/ipv4/tcp_ipv4.c | 9 +++++++-- net/ipv6/tcp_ipv6.c | 9 +++++++-- net/mptcp/subflow.c | 36 ++++++++++++++++++++++++------------ 5 files changed, 44 insertions(+), 28 deletions(-) (limited to 'net/mptcp') diff --git a/include/net/tcp.h b/include/net/tcp.h index d5d22c411918..4525d6256321 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2007,15 +2007,14 @@ struct tcp_request_sock_ops { const struct sock *sk, const struct sk_buff *skb); #endif - void (*init_req)(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif - struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, - const struct request_sock *req); + struct dst_entry *(*route_req)(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb3a7750f623..9e8a6c1aa019 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6799,18 +6799,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - af_ops->init_req(req, sk, skb); - - if (security_inet_conn_request(sk, skb, req)) + dst = af_ops->route_req(sk, skb, &fl, req); + if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - dst = af_ops->route_req(sk, &fl, req); - if (!dst) - goto drop_and_free; - if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ if (!net->ipv4.sysctl_tcp_syncookies && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e4b31e70bd30..af2338294598 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1444,9 +1444,15 @@ static void tcp_v4_init_req(struct request_sock *req, } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct sk_buff *skb, struct flowi *fl, - const struct request_sock *req) + struct request_sock *req) { + tcp_v4_init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + return NULL; + return inet_csk_route_req(sk, &fl->u.ip4, req); } @@ -1466,7 +1472,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif - .init_req = tcp_v4_init_req, #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v4_init_sequence, #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 992cbf3eb9e3..1a1510513739 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -828,9 +828,15 @@ static void tcp_v6_init_req(struct request_sock *req, } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, + struct sk_buff *skb, struct flowi *fl, - const struct request_sock *req) + struct request_sock *req) { + tcp_v6_init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + return NULL; + return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } @@ -851,7 +857,6 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif - .init_req = tcp_v6_init_req, #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 023c856424b7..727e607f40d2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -228,27 +228,39 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); -static void subflow_v4_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v4_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; - subflow_init_req(req, sk_listener, skb); + subflow_init_req(req, sk, skb); + return dst; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static void subflow_v6_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v6_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; - subflow_init_req(req, sk_listener, skb); + subflow_init_req(req, sk, skb); + return dst; } #endif @@ -1388,7 +1400,7 @@ void __init mptcp_subflow_init(void) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; - subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; @@ -1397,7 +1409,7 @@ void __init mptcp_subflow_init(void) #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; - subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; -- cgit v1.2.3 From 3ecfbe3e820997033beb4181c95d80d5c9ac6f85 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 30 Nov 2020 16:36:31 +0100 Subject: mptcp: emit tcp reset when a join request fails RFC 8684 says: If the token is unknown or the host wants to refuse subflow establishment (for example, due to a limit on the number of subflows it will permit), the receiver will send back a reset (RST) signal, analogous to an unknown port in TCP, containing an MP_TCPRST option (Section 3.6) with an "MPTCP specific error" reason code. mptcp-next doesn't support MP_TCPRST yet, this can be added in another change. Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 727e607f40d2..5f5815a1665f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -112,9 +112,14 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li return 0; } -static void subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +/* Init mptcp request socket. + * + * Returns an error code if a JOIN has failed and a TCP reset + * should be sent. + */ +static int subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -125,7 +130,7 @@ static void subflow_init_req(struct request_sock *req, ret = __subflow_init_req(req, sk_listener); if (ret) - return; + return 0; mptcp_get_options(skb, &mp_opt); @@ -133,7 +138,7 @@ static void subflow_init_req(struct request_sock *req, SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (mp_opt.mp_join) - return; + return 0; } else if (mp_opt.mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } @@ -157,7 +162,7 @@ again: } else { subflow_req->mp_capable = 1; } - return; + return 0; } err = mptcp_token_new_request(req); @@ -175,7 +180,11 @@ again: subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req, skb); - if (unlikely(req->syncookie) && subflow_req->msk) { + /* Can't fall back to TCP in this case. */ + if (!subflow_req->msk) + return -EPERM; + + if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); } @@ -183,6 +192,8 @@ again: pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } + + return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, @@ -234,6 +245,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct request_sock *req) { struct dst_entry *dst; + int err; tcp_rsk(req)->is_mptcp = 1; @@ -241,8 +253,14 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, if (!dst) return NULL; - subflow_init_req(req, sk, skb); - return dst; + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; + + dst_release(dst); + if (!req->syncookie) + tcp_request_sock_ops.send_reset(sk, skb); + return NULL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -252,6 +270,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct request_sock *req) { struct dst_entry *dst; + int err; tcp_rsk(req)->is_mptcp = 1; @@ -259,8 +278,14 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, if (!dst) return NULL; - subflow_init_req(req, sk, skb); - return dst; + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; + + dst_release(dst); + if (!req->syncookie) + tcp6_request_sock_ops.send_reset(sk, skb); + return NULL; } #endif -- cgit v1.2.3 From 6eb3d1e350d1d166e6210a2abc310dbee5f03247 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:18 -0800 Subject: mptcp: unify ADD_ADDR and echo suboptions writing There are two differences between ADD_ADDR suboption and ADD_ADDR echo suboption: The length of the former is 8 octets longer than the length of the latter. The former's echo-flag is 0, and latter's echo-flag is 1. This patch added two local variables, len and echo, to unify ADD_ADDR and ADD_ADDR echo suboptions writing. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 6b7b4b67f18c..c0cf0f5b9043 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1071,15 +1071,16 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, mp_capable_done: if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { - if (opts->ahmac) - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR, 0, - opts->addr_id); - else - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR_BASE, - MPTCP_ADDR_ECHO, - opts->addr_id); + u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; + u8 echo = MPTCP_ADDR_ECHO; + + if (opts->ahmac) { + len += sizeof(opts->ahmac); + echo = 0; + } + + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + len, echo, opts->addr_id); memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); ptr += 1; if (opts->ahmac) { @@ -1090,15 +1091,15 @@ mp_capable_done: #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { - if (opts->ahmac) - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR6, 0, - opts->addr_id); - else - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR6_BASE, - MPTCP_ADDR_ECHO, - opts->addr_id); + u8 len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; + u8 echo = MPTCP_ADDR_ECHO; + + if (opts->ahmac) { + len += sizeof(opts->ahmac); + echo = 0; + } + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + len, echo, opts->addr_id); memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); ptr += 4; if (opts->ahmac) { -- cgit v1.2.3 From e1ef6832224aa62b36ba98a1a7c183e41962590c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:19 -0800 Subject: mptcp: unify ADD_ADDR and ADD_ADDR6 suboptions writing The length of ADD_ADDR6 is 12 octets longer than ADD_ADDR. That's the only difference between them. This patch dropped the duplicate code between ADD_ADDR and ADD_ADDR6 suboptions writing, and unify them into one. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c0cf0f5b9043..ab86f897c08b 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1070,10 +1070,19 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, } mp_capable_done: - if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + if ((OPTION_MPTCP_ADD_ADDR +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + | OPTION_MPTCP_ADD_ADDR6 +#endif + ) & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) + len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; +#endif + if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; @@ -1081,33 +1090,21 @@ mp_capable_done: *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, len, echo, opts->addr_id); - memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); - ptr += 1; - if (opts->ahmac) { - put_unaligned_be64(opts->ahmac, ptr); - ptr += 2; + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + ptr += 1; } - } - #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { - u8 len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; - u8 echo = MPTCP_ADDR_ECHO; - - if (opts->ahmac) { - len += sizeof(opts->ahmac); - echo = 0; + else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { + memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + ptr += 4; } - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - len, echo, opts->addr_id); - memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); - ptr += 4; +#endif if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } -#endif if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, -- cgit v1.2.3 From 22fb85ffaefb80a22c815008a500273b3f61bba3 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:20 -0800 Subject: mptcp: add port support for ADD_ADDR suboption writing In rfc8684, the length of ADD_ADDR suboption with IPv4 address and port is 18 octets, but mptcp_write_options is 32-bit aligned, so we need to pad it to 20 octets. All the other port related option lengths need to be added up 2 octets similarly. This patch added a new field 'port' in mptcp_out_options. When this field is set with a port number, we need to add up 4 octets for the ADD_ADDR suboption, and put the port number into the suboption. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 1 + net/mptcp/options.c | 30 +++++++++++++++++++++++++++--- net/mptcp/protocol.h | 10 +++++----- 3 files changed, 33 insertions(+), 8 deletions(-) (limited to 'net/mptcp') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index b6cf07143a8a..5694370be3d4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -46,6 +46,7 @@ struct mptcp_out_options { #endif }; u8 addr_id; + u16 port; u64 ahmac; u8 rm_id; u8 join_id; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index ab86f897c08b..f841128a86c6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1083,6 +1083,9 @@ mp_capable_done: len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif + if (opts->port) + len += TCPOLEN_MPTCP_PORT_LEN; + if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; @@ -1100,9 +1103,30 @@ mp_capable_done: ptr += 4; } #endif - if (opts->ahmac) { - put_unaligned_be64(opts->ahmac, ptr); - ptr += 2; + + if (!opts->port) { + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } else { + if (opts->ahmac) { + u8 *bptr = (u8 *)ptr; + + put_unaligned_be16(opts->port, bptr); + bptr += 2; + put_unaligned_be64(opts->ahmac, bptr); + bptr += 8; + put_unaligned_be16(TCPOPT_NOP << 8 | + TCPOPT_NOP, bptr); + + ptr += 3; + } else { + put_unaligned_be32(opts->port << 16 | + TCPOPT_NOP << 8 | + TCPOPT_NOP, ptr); + ptr += 1; + } } } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index fc56e730fb35..9032174b446a 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -49,14 +49,14 @@ #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 -#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 20 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 -#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 12 #define TCPOLEN_MPTCP_ADD_ADDR6 28 -#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 32 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 -#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 -#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 +#define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 /* MPTCP MP_JOIN flags */ -- cgit v1.2.3 From 2ec72faec86bc92c573fc3bada8001115670da44 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:21 -0800 Subject: mptcp: use adding up size to get ADD_ADDR length This patch uses adding up size to get the ADD_ADDR suboption length rather than returning the ADD_ADDR size constants. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9032174b446a..9315f6a8343a 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -577,10 +577,14 @@ static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) static inline unsigned int mptcp_add_addr_len(int family, bool echo) { - if (family == AF_INET) - return echo ? TCPOLEN_MPTCP_ADD_ADDR_BASE - : TCPOLEN_MPTCP_ADD_ADDR; - return echo ? TCPOLEN_MPTCP_ADD_ADDR6_BASE : TCPOLEN_MPTCP_ADD_ADDR6; + u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; + + if (family == AF_INET6) + len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; + if (!echo) + len += MPTCPOPT_THMAC_LEN; + + return len; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, -- cgit v1.2.3 From 4a2777a83406cc87edf08b5c71887d896f628525 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:22 -0800 Subject: mptcp: add the outgoing ADD_ADDR port support This patch added a new add_addr_signal type named MPTCP_ADD_ADDR_PORT, to identify it is an address with port to be added. It also added a new parameter 'port' for both mptcp_add_addr_len and mptcp_pm_add_addr_signal. In mptcp_established_options_add_addr, we check whether the announced address is added with port. If it is, we put this port number to mptcp_out_options's port field. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 10 +++++++--- net/mptcp/pm.c | 5 +++-- net/mptcp/protocol.h | 12 ++++++++++-- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index f841128a86c6..faae58cc7330 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -587,6 +587,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * unsigned int opt_size = *size; struct mptcp_addr_info saddr; bool echo; + bool port; int len; if (mptcp_pm_should_add_signal_ipv6(msk) && @@ -598,10 +599,10 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * } if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo))) + !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port))) return false; - len = mptcp_add_addr_len(saddr.family, echo); + len = mptcp_add_addr_len(saddr.family, echo, port); if (remaining < len) return false; @@ -609,6 +610,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * if (drop_other_suboptions) *size -= opt_size; opts->addr_id = saddr.id; + if (port) + opts->port = ntohs(saddr.port); if (saddr.family == AF_INET) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR; opts->addr = saddr.addr; @@ -631,7 +634,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * } } #endif - pr_debug("addr_id=%d, ahmac=%llu, echo=%d", opts->addr_id, opts->ahmac, echo); + pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", + opts->addr_id, opts->ahmac, echo, opts->port); return true; } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 75c5040e8d5d..6d4be02681fa 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -188,7 +188,7 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) /* path manager helpers */ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo) + struct mptcp_addr_info *saddr, bool *echo, bool *port) { int ret = false; @@ -199,8 +199,9 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, goto out_unlock; *echo = mptcp_pm_should_add_signal_echo(msk); + *port = mptcp_pm_should_add_signal_port(msk); - if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo)) + if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo, *port)) goto out_unlock; *saddr = msk->pm.local; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9315f6a8343a..d69b4fc918cf 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -172,6 +172,7 @@ enum mptcp_add_addr_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, MPTCP_ADD_ADDR_IPV6, + MPTCP_ADD_ADDR_PORT, }; struct mptcp_pm_data { @@ -570,12 +571,17 @@ static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); } +static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_PORT); +} + static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.rm_addr_signal); } -static inline unsigned int mptcp_add_addr_len(int family, bool echo) +static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; @@ -583,12 +589,14 @@ static inline unsigned int mptcp_add_addr_len(int family, bool echo) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; if (!echo) len += MPTCPOPT_THMAC_LEN; + if (port) + len += TCPOLEN_MPTCP_PORT_LEN; return len; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo); + struct mptcp_addr_info *saddr, bool *echo, bool *port); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, u8 *rm_id); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -- cgit v1.2.3 From fbe0f87ac7710de31f9c37280b08e0d0d43aa6bf Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:23 -0800 Subject: mptcp: send out dedicated packet for ADD_ADDR using port The process is similar to that of the ADD_ADDR IPv6, this patch also sent out a pure ack for the ADD_ADDR using port. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 3 ++- net/mptcp/pm.c | 3 ++- net/mptcp/pm_netlink.c | 14 +++++++++++--- 3 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index faae58cc7330..9505b11a6577 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -590,7 +590,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * bool port; int len; - if (mptcp_pm_should_add_signal_ipv6(msk) && + if ((mptcp_pm_should_add_signal_ipv6(msk) || + mptcp_pm_should_add_signal_port(msk)) && skb && skb_is_tcp_pure_ack(skb)) { pr_debug("drop other suboptions"); opts->suboptions = 0; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 6d4be02681fa..0c456747893e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -167,7 +167,8 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) { - if (!mptcp_pm_should_add_signal_ipv6(msk)) + if (!mptcp_pm_should_add_signal_ipv6(msk) && + !mptcp_pm_should_add_signal_port(msk)) return; mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 03f2c28f11f5..7a0f700e34bb 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -407,7 +407,8 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - if (!mptcp_pm_should_add_signal_ipv6(msk)) + if (!mptcp_pm_should_add_signal_ipv6(msk) && + !mptcp_pm_should_add_signal_port(msk)) return; __mptcp_flush_join_list(msk); @@ -417,14 +418,21 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) u8 add_addr; spin_unlock_bh(&msk->pm.lock); - pr_debug("send ack for add_addr6"); + if (mptcp_pm_should_add_signal_ipv6(msk)) + pr_debug("send ack for add_addr6"); + if (mptcp_pm_should_add_signal_port(msk)) + pr_debug("send ack for add_addr_port"); + lock_sock(ssk); tcp_send_ack(ssk); release_sock(ssk); spin_lock_bh(&msk->pm.lock); add_addr = READ_ONCE(msk->pm.add_addr_signal); - add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); + if (mptcp_pm_should_add_signal_ipv6(msk)) + add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); + if (mptcp_pm_should_add_signal_port(msk)) + add_addr &= ~BIT(MPTCP_ADD_ADDR_PORT); WRITE_ONCE(msk->pm.add_addr_signal, add_addr); } } -- cgit v1.2.3 From 0f5c9e3f079f1d0355fd8f5e5ec7e3ada095eef4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:24 -0800 Subject: mptcp: add port parameter for mptcp_pm_announce_addr This patch added a new parameter 'port' for mptcp_pm_announce_addr. If this parameter is true, we set the MPTCP_ADD_ADDR_PORT bit of the add_addr_signal. That means the announced address is added with a port number. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm.c | 6 ++++-- net/mptcp/pm_netlink.c | 9 ++++++--- net/mptcp/protocol.h | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 0c456747893e..e63e14f4cf2a 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -14,7 +14,7 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo) + bool echo, bool port) { u8 add_addr = READ_ONCE(msk->pm.add_addr_signal); @@ -26,6 +26,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); if (addr->family == AF_INET6) add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); + if (port) + add_addr |= BIT(MPTCP_ADD_ADDR_PORT); WRITE_ONCE(msk->pm.add_addr_signal, add_addr); return 0; } @@ -156,7 +158,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_lock_bh(&pm->lock); if (!READ_ONCE(pm->accept_addr)) { - mptcp_pm_announce_addr(msk, addr, true); + mptcp_pm_announce_addr(msk, addr, true, addr->port); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7a0f700e34bb..2560c502356b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -227,7 +227,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!mptcp_pm_should_add_signal(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_announce_addr(msk, &entry->addr, false, entry->addr.port); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } @@ -328,7 +328,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_announce_addr(msk, &local->addr, false, local->addr.port); mptcp_pm_nl_add_addr_send_ack(msk); } } else { @@ -376,6 +376,7 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk; struct mptcp_addr_info remote; struct mptcp_addr_info local; + bool use_port = false; pr_debug("accepted %d:%d remote family %d", msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max, @@ -392,6 +393,8 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; if (!remote.port) remote.port = sk->sk_dport; + else + use_port = true; memset(&local, 0, sizeof(local)); local.family = remote.family; @@ -399,7 +402,7 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) __mptcp_subflow_connect((struct sock *)msk, &local, &remote); spin_lock_bh(&msk->pm.lock); - mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_announce_addr(msk, &remote, true, use_port); mptcp_pm_nl_add_addr_send_ack(msk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d69b4fc918cf..e880fa802cdf 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -552,7 +552,7 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo); + bool echo, bool port); int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); -- cgit v1.2.3 From 90a4aea8b6edff458977361be4b403779c84af80 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:25 -0800 Subject: mptcp: print out port and ahmac when receiving ADD_ADDR This patch printed out more debugging information for the ADD_ADDR suboption parsing on the incoming path. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 9505b11a6577..d1b4c5d208a9 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -242,9 +242,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->add_addr = 1; mp_opt->addr_id = *ptr++; - pr_debug("ADD_ADDR%s: id=%d, echo=%d", - (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", - mp_opt->addr_id, mp_opt->echo); if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); ptr += 4; @@ -269,6 +266,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } + pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", + (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", + mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port); break; case MPTCPOPT_RM_ADDR: -- cgit v1.2.3 From 42842a425ad6d1ef1087b63486879a6d54b26893 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:26 -0800 Subject: mptcp: drop rm_addr_signal flag This patch reused add_addr_signal for the RM_ADDR announcing signal, by defining a new ADD_ADDR status named MPTCP_RM_ADDR_SIGNAL. Then the flag rm_addr_signal in PM could be dropped. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm.c | 18 +++++++++++++++--- net/mptcp/protocol.h | 4 ++-- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e63e14f4cf2a..09d6e736161d 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -20,6 +20,11 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, pr_debug("msk=%p, local_id=%d", msk, addr->id); + if (add_addr) { + pr_warn("addr_signal error, add_addr=%d", add_addr); + return -EINVAL; + } + msk->pm.local = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); if (echo) @@ -34,10 +39,18 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) { + u8 rm_addr = READ_ONCE(msk->pm.add_addr_signal); + pr_debug("msk=%p, local_id=%d", msk, local_id); + if (rm_addr) { + pr_warn("addr_signal error, rm_addr=%d", rm_addr); + return -EINVAL; + } + msk->pm.rm_id = local_id; - WRITE_ONCE(msk->pm.rm_addr_signal, true); + rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); + WRITE_ONCE(msk->pm.add_addr_signal, rm_addr); return 0; } @@ -231,7 +244,7 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, goto out_unlock; *rm_id = msk->pm.rm_id; - WRITE_ONCE(msk->pm.rm_addr_signal, false); + WRITE_ONCE(msk->pm.add_addr_signal, 0); ret = true; out_unlock: @@ -253,7 +266,6 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.rm_id = 0; WRITE_ONCE(msk->pm.work_pending, false); WRITE_ONCE(msk->pm.add_addr_signal, 0); - WRITE_ONCE(msk->pm.rm_addr_signal, false); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); msk->pm.status = 0; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e880fa802cdf..f002c12beb98 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -173,6 +173,7 @@ enum mptcp_add_addr_status { MPTCP_ADD_ADDR_ECHO, MPTCP_ADD_ADDR_IPV6, MPTCP_ADD_ADDR_PORT, + MPTCP_RM_ADDR_SIGNAL, }; struct mptcp_pm_data { @@ -183,7 +184,6 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ u8 add_addr_signal; - bool rm_addr_signal; bool server_side; bool work_pending; bool accept_addr; @@ -578,7 +578,7 @@ static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk) static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.rm_addr_signal); + return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) -- cgit v1.2.3 From 13ad9f01a29e3f458fb3b319fb53323b2b0d1e68 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:27 -0800 Subject: mptcp: rename add_addr_signal and mptcp_add_addr_status Since the RM_ADDR signal had been reused with add_addr_signal, it's not suitable to call it add_addr_signal or mptcp_add_addr_status. So this patch renamed add_addr_signal to addr_signal, and renamed mptcp_add_addr_status to mptcp_addr_signal_status. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm.c | 14 +++++++------- net/mptcp/pm_netlink.c | 4 ++-- net/mptcp/protocol.h | 14 +++++++------- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 09d6e736161d..9256bd5d02ed 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -16,7 +16,7 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo, bool port) { - u8 add_addr = READ_ONCE(msk->pm.add_addr_signal); + u8 add_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, local_id=%d", msk, addr->id); @@ -33,13 +33,13 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); if (port) add_addr |= BIT(MPTCP_ADD_ADDR_PORT); - WRITE_ONCE(msk->pm.add_addr_signal, add_addr); + WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) { - u8 rm_addr = READ_ONCE(msk->pm.add_addr_signal); + u8 rm_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, local_id=%d", msk, local_id); @@ -50,7 +50,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) msk->pm.rm_id = local_id; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); - WRITE_ONCE(msk->pm.add_addr_signal, rm_addr); + WRITE_ONCE(msk->pm.addr_signal, rm_addr); return 0; } @@ -221,7 +221,7 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, goto out_unlock; *saddr = msk->pm.local; - WRITE_ONCE(msk->pm.add_addr_signal, 0); + WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; out_unlock: @@ -244,7 +244,7 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, goto out_unlock; *rm_id = msk->pm.rm_id; - WRITE_ONCE(msk->pm.add_addr_signal, 0); + WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; out_unlock: @@ -265,7 +265,7 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.subflows = 0; msk->pm.rm_id = 0; WRITE_ONCE(msk->pm.work_pending, false); - WRITE_ONCE(msk->pm.add_addr_signal, 0); + WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); msk->pm.status = 0; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2560c502356b..46da9f8c9cba 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -431,12 +431,12 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) release_sock(ssk); spin_lock_bh(&msk->pm.lock); - add_addr = READ_ONCE(msk->pm.add_addr_signal); + add_addr = READ_ONCE(msk->pm.addr_signal); if (mptcp_pm_should_add_signal_ipv6(msk)) add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); if (mptcp_pm_should_add_signal_port(msk)) add_addr &= ~BIT(MPTCP_ADD_ADDR_PORT); - WRITE_ONCE(msk->pm.add_addr_signal, add_addr); + WRITE_ONCE(msk->pm.addr_signal, add_addr); } } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f002c12beb98..9ff6fd486db6 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -168,7 +168,7 @@ enum mptcp_pm_status { MPTCP_PM_SUBFLOW_ESTABLISHED, }; -enum mptcp_add_addr_status { +enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, MPTCP_ADD_ADDR_IPV6, @@ -183,7 +183,7 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ - u8 add_addr_signal; + u8 addr_signal; bool server_side; bool work_pending; bool accept_addr; @@ -558,27 +558,27 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); } static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); } static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_ADD_ADDR_PORT); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_PORT); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) -- cgit v1.2.3 From 432d9e74d8a303fc0e897392e7b8334ba222c5f8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 9 Dec 2020 15:51:28 -0800 Subject: mptcp: use the variable sk instead of open-coding Since the local variable sk has been defined, use it instead of open-coding. Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 46da9f8c9cba..5151cfcd6962 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct mptcp_pm_addr_entry *local; struct pm_nl_pernet *pernet; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + pernet = net_generic(sock_net(sk), pm_nl_pernet_id); pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, msk->pm.local_addr_max, @@ -399,7 +399,7 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) local.family = remote.family; spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect((struct sock *)msk, &local, &remote); + __mptcp_subflow_connect(sk, &local, &remote); spin_lock_bh(&msk->pm.lock); mptcp_pm_announce_addr(msk, &remote, true, use_port); -- cgit v1.2.3 From 5b950ff4331ddda6421b21a779ec23127e8e3eb8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Dec 2020 12:03:29 +0100 Subject: mptcp: link MPC subflow into msk only after accept Christoph reported the following splat: WARNING: CPU: 0 PID: 4615 at net/ipv4/inet_connection_sock.c:1031 inet_csk_listen_stop+0x8e8/0xad0 net/ipv4/inet_connection_sock.c:1031 Modules linked in: CPU: 0 PID: 4615 Comm: syz-executor.4 Not tainted 5.9.0 #37 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:inet_csk_listen_stop+0x8e8/0xad0 net/ipv4/inet_connection_sock.c:1031 Code: 03 00 00 00 e8 79 b2 3d ff e9 ad f9 ff ff e8 1f 76 ba fe be 02 00 00 00 4c 89 f7 e8 62 b2 3d ff e9 14 f9 ff ff e8 08 76 ba fe <0f> 0b e9 97 f8 ff ff e8 fc 75 ba fe be 03 00 00 00 4c 89 f7 e8 3f RSP: 0018:ffffc900037f7948 EFLAGS: 00010293 RAX: ffff88810a349c80 RBX: ffff888114ee1b00 RCX: ffffffff827b14cd RDX: 0000000000000000 RSI: ffffffff827b1c38 RDI: 0000000000000005 RBP: ffff88810a2a8000 R08: ffff88810a349c80 R09: fffff520006fef1f R10: 0000000000000003 R11: fffff520006fef1e R12: ffff888114ee2d00 R13: dffffc0000000000 R14: 0000000000000001 R15: ffff888114ee1d68 FS: 00007f2ac1945700(0000) GS:ffff88811b400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd44798bc0 CR3: 0000000109810002 CR4: 0000000000170ef0 Call Trace: __tcp_close+0xd86/0x1110 net/ipv4/tcp.c:2433 __mptcp_close_ssk+0x256/0x430 net/mptcp/protocol.c:1761 __mptcp_destroy_sock+0x49b/0x770 net/mptcp/protocol.c:2127 mptcp_close+0x62d/0x910 net/mptcp/protocol.c:2184 inet_release+0xe9/0x1f0 net/ipv4/af_inet.c:434 __sock_release+0xd2/0x280 net/socket.c:596 sock_close+0x15/0x20 net/socket.c:1277 __fput+0x276/0x960 fs/file_table.c:281 task_work_run+0x109/0x1d0 kernel/task_work.c:151 get_signal+0xe8f/0x1d40 kernel/signal.c:2561 arch_do_signal+0x88/0x1b60 arch/x86/kernel/signal.c:811 exit_to_user_mode_loop kernel/entry/common.c:161 [inline] exit_to_user_mode_prepare+0x9b/0xf0 kernel/entry/common.c:191 syscall_exit_to_user_mode+0x22/0x150 kernel/entry/common.c:266 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f2ac1254469 Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ff 49 2b 00 f7 d8 64 89 01 48 RSP: 002b:00007f2ac1944dc8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffbf RBX: 000000000069bf00 RCX: 00007f2ac1254469 RDX: 0000000000000000 RSI: 0000000000008982 RDI: 0000000000000003 RBP: 000000000069bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000069bf0c R13: 00007ffeb53f178f R14: 00000000004668b0 R15: 0000000000000003 After commit 0397c6d85f9c ("mptcp: keep unaccepted MPC subflow into join list"), the msk's workqueue and/or PM can touch the MPC subflow - and acquire its socket lock - even if it's still unaccepted. If the above event races with the relevant listener socket close, we can end-up with the above splat. This change addresses the issue delaying the MPC socket insertion in conn_list at accept time - that is, partially reverting the blamed commit. We must additionally ensure that mptcp_pm_fully_established() happens after accept() time, or the PM will not be able to handle properly such event - conn_list could be empty otherwise. In the receive path, we check the subflow list node to ensure it is out of the listener queue. Be sure client subflows do not match transiently such condition moving them into the join list earlier at creation time. Since we now have multiple mptcp_pm_fully_established() call sites from different code-paths, said helper can now race with itself. Use an additional PM status bit to avoid multiple notifications. Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/103 Fixes: 0397c6d85f9c ("mptcp: keep unaccepted MPC subflow into join list"), Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 7 ++++++- net/mptcp/pm.c | 8 +++++++- net/mptcp/protocol.c | 11 +++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 14 ++++++++++---- 5 files changed, 35 insertions(+), 6 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index d1b4c5d208a9..1ca60d9da3ef 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -802,7 +802,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, mptcp_subflow_fully_established(subflow, mp_opt); fully_established: - if (likely(subflow->pm_notified)) + /* if the subflow is not already linked into the conn_list, we can't + * notify the PM: this subflow is still on the listener queue + * and the PM possibly acquiring the subflow lock could race with + * the listener close + */ + if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 9256bd5d02ed..da2ed576f289 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -126,8 +126,14 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); - if (READ_ONCE(pm->work_pending)) + /* mptcp_pm_fully_established() can be invoked by multiple + * racing paths - accept() and check_fully_established() + * be sure to serve this event only once. + */ + if (READ_ONCE(pm->work_pending) && + !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); + msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 57213ff60f78..4e29dcf17ecd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3208,6 +3208,17 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, bool slowpath; slowpath = lock_sock_fast(newsk); + + /* PM/worker can now acquire the first subflow socket + * lock without racing with listener queue cleanup, + * we can notify it, if needed. + */ + subflow = mptcp_subflow_ctx(msk->first); + list_add(&subflow->node, &msk->conn_list); + sock_hold(msk->first); + if (mptcp_is_fully_established(newsk)) + mptcp_pm_fully_established(msk); + mptcp_copy_inaddrs(newsk, msk->first); mptcp_rcv_space_init(msk, msk->first); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9ff6fd486db6..f6c3c686a34a 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -165,6 +165,7 @@ enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, + MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_SUBFLOW_ESTABLISHED, }; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5f5815a1665f..9b5a966b0041 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -614,8 +614,9 @@ create_child: */ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); - /* link the newly created socket to the msk */ - mptcp_add_pending_subflow(mptcp_sk(new_msk), ctx); + /* record the newly created socket as the first msk + * subflow, but don't link it yet into conn_list + */ WRITE_ONCE(mptcp_sk(new_msk)->first, child); /* new mpc subflow takes ownership of the newly @@ -1148,13 +1149,18 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr); + mptcp_add_pending_subflow(msk, subflow); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) - goto failed; + goto failed_unlink; - mptcp_add_pending_subflow(msk, subflow); return err; +failed_unlink: + spin_lock_bh(&msk->join_list_lock); + list_del(&subflow->node); + spin_unlock_bh(&msk->join_list_lock); + failed: subflow->disposable = 1; sock_release(sf); -- cgit v1.2.3 From 0597d0f8e030d1a5e64708b0f3233209a8b5d39e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Dec 2020 12:03:30 +0100 Subject: mptcp: plug subflow context memory leak When a MPTCP listener socket is closed with unaccepted children pending, the ULP release callback will be invoked, but nobody will call into __mptcp_close_ssk() on the corresponding subflow. As a consequence, at ULP release time, the 'disposable' flag will be cleared and the subflow context memory will be leaked. This change addresses the issue always freeing the context if the subflow is still in the accept queue at ULP release time. Additionally, this fixes an incorrect code reference in the related comment. Note: this fix leverages the changes introduced by the previous commit. Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9b5a966b0041..fefcaf497938 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1339,9 +1339,10 @@ static void subflow_ulp_release(struct sock *ssk) sk = ctx->conn; if (sk) { /* if the msk has been orphaned, keep the ctx - * alive, will be freed by mptcp_done() + * alive, will be freed by __mptcp_close_ssk(), + * when the subflow is still unaccepted */ - release = ctx->disposable; + release = ctx->disposable || list_empty(&ctx->node); sock_put(sk); } -- cgit v1.2.3 From d7b1bfd0832c1d005f571203306b6c50e9805150 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Dec 2020 12:03:31 +0100 Subject: mptcp: be careful on subflows shutdown When the workqueue disposes of the msk, the subflows can still receive some data from the peer after __mptcp_close_ssk() completes. The above could trigger a race between the msk receive path and the msk destruction. Acquiring the mptcp_data_lock() in __mptcp_destroy_sock() will not save the day: the rx path could be reached even after msk destruction completes. Instead use the subflow 'disposable' flag to prevent entering the msk receive path after __mptcp_close_ssk(). Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4e29dcf17ecd..2540d82742ac 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -701,6 +701,13 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) int sk_rbuf, ssk_rbuf; bool wake; + /* The peer can send data while we are shutting down this + * subflow at msk destruction time, but we must avoid enqueuing + * more data to the msk receive queue + */ + if (unlikely(subflow->disposable)) + return; + /* move_skbs_to_msk below can legitly clear the data_avail flag, * but we will need later to properly woke the reader, cache its * value @@ -2119,6 +2126,8 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, sock_orphan(ssk); } + subflow->disposable = 1; + /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops * the ssk has been already destroyed, we just need to release the * reference owned by msk; @@ -2126,8 +2135,7 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (!inet_csk(ssk)->icsk_ulp_ops) { kfree_rcu(subflow, rcu); } else { - /* otherwise ask tcp do dispose of ssk and subflow ctx */ - subflow->disposable = 1; + /* otherwise tcp will dispose of the ssk and subflow ctx */ __tcp_close(ssk, 0); /* close acquired an extra ref */ -- cgit v1.2.3 From 3764b0c5651e34ceb3e7d5c75b6fd8e7b72112ac Mon Sep 17 00:00:00 2001 From: Nicolas Rybowski Date: Thu, 10 Dec 2020 14:24:58 -0800 Subject: mptcp: attach subflow socket to parent cgroup It has been observed that the kernel sockets created for the subflows (except the first one) are not in the same cgroup as their parents. That's because the additional subflows are created by kernel workers. This is a problem with eBPF programs attached to the parent's cgroup won't be executed for the children. But also with any other features of CGroup linked to a sk. This patch fixes this behaviour. As the subflow sockets are created by the kernel, we can't use 'mem_cgroup_sk_alloc' because of the current context being the one of the kworker. This is why we have to do low level memcg manipulation, if required. Suggested-by: Matthieu Baerts Suggested-by: Paolo Abeni Acked-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fefcaf497938..bf808f1fabe5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1167,6 +1167,30 @@ failed: return err; } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, + *child_skcd = &child->sk_cgrp_data; + + /* only the additional subflows created by kworkers have to be modified */ + if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != + cgroup_id(sock_cgroup_ptr(child_skcd))) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = parent->sk_memcg; + + mem_cgroup_sk_free(child); + if (memcg && css_tryget(&memcg->css)) + child->sk_memcg = memcg; +#endif /* CONFIG_MEMCG */ + + cgroup_sk_free(child_skcd); + *child_skcd = *parent_skcd; + cgroup_sk_clone(child_skcd); + } +#endif /* CONFIG_SOCK_CGROUP_DATA */ +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -1187,6 +1211,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) lock_sock(sf->sk); + /* the newly created socket has to be in the same cgroup as its parent */ + mptcp_attach_cgroup(sk, sf->sk); + /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. */ -- cgit v1.2.3 From 141694df6573b49aa4143c92556544b4b0bbda72 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Dec 2020 14:24:59 -0800 Subject: mptcp: remove address when netlink flushes addrs When the PM netlink flushes the addresses, invoke the remove address function mptcp_nl_remove_subflow_and_signal_addr to remove the addresses and the subflows. Since this function should not be invoked under lock, move __flush_addrs out of the pernet->lock. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5151cfcd6962..9cc4eefaf080 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -867,13 +867,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void __flush_addrs(struct pm_nl_pernet *pernet) +static void __flush_addrs(struct net *net, struct list_head *list) { - while (!list_empty(&pernet->local_addr_list)) { + while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; - cur = list_entry(pernet->local_addr_list.next, + cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); + mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); list_del_rcu(&cur->list); kfree_rcu(cur, rcu); } @@ -890,11 +891,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet) static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - __flush_addrs(pernet); + list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); spin_unlock_bh(&pernet->lock); + __flush_addrs(sock_net(skb->sk), &free_list); return 0; } @@ -1156,10 +1159,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net_generic(net, pm_nl_pernet_id)); + __flush_addrs(net, &pernet->local_addr_list); } } -- cgit v1.2.3 From ba34c3de71ced1582dee55f2fae8638a3655d957 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Dec 2020 14:25:01 -0800 Subject: mptcp: use MPTCPOPT_HMAC_LEN macro Use the macro MPTCPOPT_HMAC_LEN instead of a constant in struct mptcp_options_received. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f6c3c686a34a..a5bc9599ae5c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -119,7 +119,7 @@ struct mptcp_options_received { u32 token; u32 nonce; u64 thmac; - u8 hmac[20]; + u8 hmac[MPTCPOPT_HMAC_LEN]; u8 join_id; u8 use_map:1, dsn64:1, -- cgit v1.2.3 From ab82e996a1fa1b9ae514fa357d9ce8df62321157 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Dec 2020 14:25:02 -0800 Subject: mptcp: hold mptcp socket before calling tcp_done When processing options from tcp reset path its possible that tcp_done(ssk) drops the last reference on the mptcp socket which results in use-after-free. Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/mptcp') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bf808f1fabe5..73e66a406d99 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -313,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* must hold: tcp_done() could drop last reference on parent */ + sock_hold(sk); + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + return; /* worker will put sk for us */ + + sock_put(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) -- cgit v1.2.3 From 50c504a20a754ca37b5e1f4e660cd687769a7dca Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Dec 2020 14:25:04 -0800 Subject: mptcp: parse and act on incoming FASTCLOSE option parse the MPTCP FASTCLOSE subtype. If provided key matches the local one, schedule the work queue to close (with tcp reset) all subflows. The MPTCP socket moves to closed state immediately. Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 17 +++++++++++++++++ net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 4 ++++ 3 files changed, 54 insertions(+) (limited to 'net/mptcp') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1ca60d9da3ef..5e7d7755d1a6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -282,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break; + case MPTCPOPT_MP_FASTCLOSE: + if (opsize != TCPOLEN_MPTCP_FASTCLOSE) + break; + + ptr += 2; + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->fastclose = 1; + break; + default: break; } @@ -299,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->fastclose = 0; mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; @@ -942,6 +953,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; + if (mp_opt.fastclose && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } + if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { struct mptcp_addr_info addr; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2540d82742ac..cb8b7adf218a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2217,6 +2217,36 @@ static bool mptcp_check_close_timeout(const struct sock *sk) return true; } +static void mptcp_check_fastclose(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = &msk->sk.icsk_inet.sk; + + if (likely(!READ_ONCE(msk->rcv_fastclose))) + return; + + mptcp_token_destroy(msk); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(tcp_sk); + if (tcp_sk->sk_state != TCP_CLOSE) { + tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_set_state(tcp_sk, TCP_CLOSE); + } + release_sock(tcp_sk); + } + + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_shutdown = SHUTDOWN_MASK; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); + + mptcp_close_wake_up(sk); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2233,6 +2263,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + + mptcp_check_fastclose(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a5bc9599ae5c..7cf9d110b85f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -23,6 +23,7 @@ #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) +#define OPTION_MPTCP_FASTCLOSE BIT(9) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -58,6 +59,7 @@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 #define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_FASTCLOSE 12 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -110,6 +112,7 @@ struct mptcp_options_received { u16 data_len; u16 mp_capable : 1, mp_join : 1, + fastclose : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -237,6 +240,7 @@ struct mptcp_sock { bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; struct sock *ack_hint; -- cgit v1.2.3 From 1bc7327b5fea60328bf72cd702eca1defa2a5655 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Dec 2020 14:25:05 -0800 Subject: mptcp: pm: simplify select_local_address() There is no need to unconditionally acquire the join list lock, we can simply splice the join list into the subflow list and traverse only the latter. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 9cc4eefaf080..a6d983d80576 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -135,7 +135,7 @@ select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, *ret = NULL; rcu_read_lock(); - spin_lock_bh(&msk->join_list_lock); + __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -144,13 +144,11 @@ select_local_address(const struct pm_nl_pernet *pernet, * pending join */ if (entry->addr.family == ((struct sock *)msk)->sk_family && - !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { ret = entry; break; } } - spin_unlock_bh(&msk->join_list_lock); rcu_read_unlock(); return ret; } -- cgit v1.2.3 From 15e6ca974b14c2dc4221738ef81b23ef694c9160 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Dec 2020 14:25:06 -0800 Subject: mptcp: let MPTCP create max size skbs Currently the xmit path of the MPTCP protocol creates smaller- than-max-size skbs, which is suboptimal for the performances. There are a few things to improve: - when coalescing to an existing skb, must clear the PUSH flag - tcp_build_frag() expect the available space as an argument. When coalescing is enable MPTCP already subtracted the to-be-coalesced skb len. We must increment said argument accordingly. Before: ./use_mptcp.sh netperf -H 127.0.0.1 -t TCP_STREAM [...] 131072 16384 16384 30.00 24414.86 After: ./use_mptcp.sh netperf -H 127.0.0.1 -t TCP_STREAM [...] 131072 16384 16384 30.05 28357.69 Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/mptcp') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index cb8b7adf218a..b812aaae8044 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1256,6 +1256,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_ext *mpext = NULL; struct sk_buff *skb, *tail; bool can_collapse = false; + int size_bias = 0; int avail_size; size_t ret = 0; @@ -1277,10 +1278,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext = skb_ext_find(skb, SKB_EXT_MPTCP); can_collapse = (info->size_goal - skb->len > 0) && mptcp_skb_can_collapse_to(data_seq, skb, mpext); - if (!can_collapse) + if (!can_collapse) { TCP_SKB_CB(skb)->eor = 1; - else + } else { + size_bias = skb->len; avail_size = info->size_goal - skb->len; + } } /* Zero window and all data acked? Probe. */ @@ -1300,8 +1303,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return 0; ret = info->limit - info->sent; - tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, - dfrag->offset + info->sent, &ret); + tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, + dfrag->page, dfrag->offset + info->sent, &ret); if (!tail) { tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); return -ENOMEM; @@ -1310,8 +1313,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* if the tail skb is still the cached one, collapsing really happened. */ if (skb == tail) { - WARN_ON_ONCE(!can_collapse); + TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; + WARN_ON_ONCE(!can_collapse); WARN_ON_ONCE(zero_window_probe); goto out; } -- cgit v1.2.3