diff options
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r-- | net/mptcp/protocol.c | 570 |
1 files changed, 461 insertions, 109 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5d747c6a610e..185dacb39781 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -24,8 +24,6 @@ #include "protocol.h" #include "mib.h" -#define MPTCP_SAME_STATE TCP_MAX_STATES - #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; @@ -34,6 +32,8 @@ struct mptcp6_sock { #endif struct mptcp_skb_cb { + u64 map_seq; + u64 end_seq; u32 offset; }; @@ -112,64 +112,205 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) return 0; } -static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, - struct sk_buff *skb, - unsigned int offset, size_t copy_len) +static void mptcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + +static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from) +{ + bool fragstolen; + int delta; + + if (MPTCP_SKB_CB(from)->offset || + !skb_try_coalesce(to, from, &fragstolen, &delta)) + return false; + + pr_debug("colesced seq %llx into %llx new len %d new end seq %llx", + MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, + to->len, MPTCP_SKB_CB(from)->end_seq); + MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; + kfree_skb_partial(from, fragstolen); + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + return true; +} + +static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, + struct sk_buff *from) +{ + if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) + return false; + + return mptcp_try_coalesce((struct sock *)msk, to, from); +} + +/* "inspired" by tcp_data_queue_ofo(), main differences: + * - use mptcp seqs + * - don't cope with sacks + */ +static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) { struct sock *sk = (struct sock *)msk; - struct sk_buff *tail; + struct rb_node **p, *parent; + u64 seq, end_seq, max_seq; + struct sk_buff *skb1; + int space; + + seq = MPTCP_SKB_CB(skb)->map_seq; + end_seq = MPTCP_SKB_CB(skb)->end_seq; + space = tcp_space(sk); + max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq; + + pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, + RB_EMPTY_ROOT(&msk->out_of_order_queue)); + if (after64(seq, max_seq)) { + /* out of window */ + mptcp_drop(sk, skb); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); + return; + } - __skb_unlink(skb, &ssk->sk_receive_queue); + p = &msk->out_of_order_queue.rb_node; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); + if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); + msk->ooo_last_skb = skb; + goto end; + } - skb_ext_reset(skb); - skb_orphan(skb); - WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); + /* with 2 subflows, adding at end of ooo queue is quite likely + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); + return; + } - tail = skb_peek_tail(&sk->sk_receive_queue); - if (offset == 0 && tail) { - bool fragstolen; - int delta; + /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ + if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); + parent = &msk->ooo_last_skb->rbnode; + p = &parent->rb_right; + goto insert; + } - if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { - kfree_skb_partial(skb, fragstolen); - atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_to_skb(parent); + if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { + p = &parent->rb_left; + continue; + } + if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { + if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + mptcp_drop(sk, skb); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + return; + } + if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { + /* partial overlap: + * | skb | + * | skb1 | + * continue traversing + */ + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. + */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &msk->out_of_order_queue); + mptcp_drop(sk, skb1); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + goto merge_right; + } + } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); return; } + p = &parent->rb_right; } - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); - MPTCP_SKB_CB(skb)->offset = offset; -} +insert: + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); -static void mptcp_stop_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); +merge_right: + /* Remove other segments covered by skb. */ + while ((skb1 = skb_rb_next(skb)) != NULL) { + if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) + break; + rb_erase(&skb1->rbnode, &msk->out_of_order_queue); + mptcp_drop(sk, skb1); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + } + /* If there is no skb after us, we are the last_skb ! */ + if (!skb1) + msk->ooo_last_skb = skb; - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); - mptcp_sk(sk)->timer_ival = 0; +end: + skb_condense(skb); + skb_set_owner_r(skb, sk); } -/* both sockets must be locked */ -static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, - struct sock *ssk) +static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, + struct sk_buff *skb, unsigned int offset, + size_t copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - u64 dsn = mptcp_subflow_get_mapped_dsn(subflow); + struct sock *sk = (struct sock *)msk; + struct sk_buff *tail; - /* revalidate data sequence number. - * - * mptcp_subflow_data_available() is usually called - * without msk lock. Its unlikely (but possible) - * that msk->ack_seq has been advanced since the last - * call found in-sequence data. + __skb_unlink(skb, &ssk->sk_receive_queue); + + skb_ext_reset(skb); + skb_orphan(skb); + + /* the skb map_seq accounts for the skb offset: + * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq + * value */ - if (likely(dsn == msk->ack_seq)) + MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); + MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; + MPTCP_SKB_CB(skb)->offset = offset; + + if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { + /* in sequence */ + WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); + tail = skb_peek_tail(&sk->sk_receive_queue); + if (tail && mptcp_try_coalesce(sk, tail, skb)) + return true; + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); return true; + } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { + mptcp_data_queue_ofo(msk, skb); + return false; + } - subflow->data_avail = 0; - return mptcp_subflow_data_available(ssk); + /* old data, keep it simple and drop the whole pkt, sender + * will retransmit as needed, if needed. + */ + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + mptcp_drop(sk, skb); + return false; +} + +static void mptcp_stop_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + mptcp_sk(sk)->timer_ival = 0; } static void mptcp_check_data_fin_ack(struct sock *sk) @@ -313,14 +454,12 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; + u32 old_copied_seq; bool done = false; - if (!mptcp_subflow_dsn_valid(msk, ssk)) { - *bytes = 0; - return false; - } - + pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); + old_copied_seq = tp->copied_seq; do { u32 map_remaining, offset; u32 seq = tp->copied_seq; @@ -332,8 +471,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); - if (!skb) + if (!skb) { + /* if no data is found, a racing workqueue/recvmsg + * already processed the new data, stop here or we + * can enter an infinite loop + */ + if (!moved) + done = true; break; + } if (__mptcp_check_fallback(msk)) { /* if we are running under the workqueue, TCP could have @@ -357,9 +503,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (tp->urg_data) done = true; - __mptcp_move_skb(msk, ssk, skb, offset, len); + if (__mptcp_move_skb(msk, ssk, skb, offset, len)) + moved += len; seq += len; - moved += len; if (WARN_ON_ONCE(map_remaining < len)) break; @@ -378,20 +524,56 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } } while (more_data_avail); - *bytes = moved; - - /* If the moves have caught up with the DATA_FIN sequence number - * it's time to ack the DATA_FIN and change socket state, but - * this is not a good place to change state. Let the workqueue - * do it. - */ - if (mptcp_pending_data_fin(sk, NULL) && - schedule_work(&msk->work)) - sock_hold(sk); + *bytes += moved; + if (tp->copied_seq != old_copied_seq) + tcp_cleanup_rbuf(ssk, 1); return done; } +static bool mptcp_ofo_queue(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct sk_buff *skb, *tail; + bool moved = false; + struct rb_node *p; + u64 end_seq; + + p = rb_first(&msk->out_of_order_queue); + pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); + while (p) { + skb = rb_to_skb(p); + if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) + break; + + p = rb_next(p); + rb_erase(&skb->rbnode, &msk->out_of_order_queue); + + if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, + msk->ack_seq))) { + mptcp_drop(sk, skb); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + continue; + } + + end_seq = MPTCP_SKB_CB(skb)->end_seq; + tail = skb_peek_tail(&sk->sk_receive_queue); + if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { + int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + + /* skip overlapping data, if any */ + pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d", + MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, + delta); + MPTCP_SKB_CB(skb)->offset += delta; + __skb_queue_tail(&sk->sk_receive_queue, skb); + } + msk->ack_seq = end_seq; + moved = true; + } + return moved; +} + /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ @@ -407,8 +589,19 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) return false; /* must re-check after taking the lock */ - if (!READ_ONCE(sk->sk_lock.owned)) + if (!READ_ONCE(sk->sk_lock.owned)) { __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + mptcp_ofo_queue(msk); + + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. + */ + if (mptcp_pending_data_fin(sk, NULL) && + schedule_work(&msk->work)) + sock_hold(sk); + } spin_unlock_bh(&sk->sk_lock.slock); @@ -417,9 +610,17 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) void mptcp_data_ready(struct sock *sk, struct sock *ssk) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); + bool wake; - set_bit(MPTCP_DATA_READY, &msk->flags); + /* move_skbs_to_msk below can legitly clear the data_avail flag, + * but we will need later to properly woke the reader, cache its + * value + */ + wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL; + if (wake) + set_bit(MPTCP_DATA_READY, &msk->flags); if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) && move_skbs_to_msk(msk, ssk)) @@ -440,7 +641,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) move_skbs_to_msk(msk, ssk); } wake: - sk->sk_data_ready(sk); + if (wake) + sk->sk_data_ready(sk); } static void __mptcp_flush_join_list(struct mptcp_sock *msk) @@ -474,7 +676,7 @@ void mptcp_data_acked(struct sock *sk) { mptcp_reset_timer(sk); - if ((!sk_stream_is_writeable(sk) || + if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && schedule_work(&mptcp_sk(sk)->work)) sock_hold(sk); @@ -569,6 +771,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } +static bool mptcp_is_writeable(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!sk_stream_is_writeable((struct sock *)msk)) + return false; + + mptcp_for_each_subflow(msk, subflow) { + if (sk_stream_is_writeable(subflow->tcp_sock)) + return true; + } + return false; +} + static void mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -611,8 +827,15 @@ out: sk_mem_reclaim_partial(sk); /* Only wake up writers if a subflow is ready */ - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) + if (mptcp_is_writeable(msk)) { + set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags); + smp_mb__after_atomic(); + + /* set SEND_SPACE before sk_stream_write_space clears + * NOSPACE + */ sk_stream_write_space(sk); + } } } @@ -803,60 +1026,128 @@ out: return ret; } -static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock) +static void mptcp_nospace(struct mptcp_sock *msk) { + struct mptcp_subflow_context *subflow; + clear_bit(MPTCP_SEND_SPACE, &msk->flags); smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ - /* enables sk->write_space() callbacks */ - set_bit(SOCK_NOSPACE, &sock->flags); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct socket *sock = READ_ONCE(ssk->sk_socket); + + /* enables ssk->write_space() callbacks */ + if (sock) + set_bit(SOCK_NOSPACE, &sock->flags); + } +} + +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ + sizeof(struct tcphdr) - \ + MAX_TCP_OPTION_SPACE - \ + sizeof(struct ipv6hdr) - \ + sizeof(struct frag_hdr)) + +struct subflow_send_info { + struct sock *ssk; + u64 ratio; +}; + +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, + u32 *sndbuf) { + struct subflow_send_info send_info[2]; struct mptcp_subflow_context *subflow; - struct sock *backup = NULL; + int i, nr_active = 0; + struct sock *ssk; + u64 ratio; + u32 pace; - sock_owned_by_me((const struct sock *)msk); + sock_owned_by_me((struct sock *)msk); + *sndbuf = 0; if (!mptcp_ext_cache_refill(msk)) return NULL; + if (__mptcp_check_fallback(msk)) { + if (!msk->first) + return NULL; + *sndbuf = msk->first->sk_sndbuf; + return sk_stream_memory_free(msk->first) ? msk->first : NULL; + } + + /* re-use last subflow, if the burst allow that */ + if (msk->last_snd && msk->snd_burst > 0 && + sk_stream_memory_free(msk->last_snd) && + mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); + } + return msk->last_snd; + } + + /* pick the subflow with the lower wmem/wspace ratio */ + for (i = 0; i < 2; ++i) { + send_info[i].ssk = NULL; + send_info[i].ratio = -1; + } mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + ssk = mptcp_subflow_tcp_sock(subflow); + if (!mptcp_subflow_active(subflow)) + continue; - if (!sk_stream_memory_free(ssk)) { - struct socket *sock = ssk->sk_socket; + nr_active += !subflow->backup; + *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf); + if (!sk_stream_memory_free(subflow->tcp_sock)) + continue; - if (sock) - mptcp_nospace(msk, sock); + pace = READ_ONCE(ssk->sk_pacing_rate); + if (!pace) + continue; - return NULL; + ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, + pace); + if (ratio < send_info[subflow->backup].ratio) { + send_info[subflow->backup].ssk = ssk; + send_info[subflow->backup].ratio = ratio; } + } - if (subflow->backup) { - if (!backup) - backup = ssk; + pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld", + msk, nr_active, send_info[0].ssk, send_info[0].ratio, + send_info[1].ssk, send_info[1].ratio); - continue; - } + /* pick the best backup if no other subflow is active */ + if (!nr_active) + send_info[0].ssk = send_info[1].ssk; - return ssk; + if (send_info[0].ssk) { + msk->last_snd = send_info[0].ssk; + msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, + sk_stream_wspace(msk->last_snd)); + return msk->last_snd; } - - return backup; + return NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) +static void ssk_check_wmem(struct mptcp_sock *msk) { - struct socket *sock; - - if (likely(sk_stream_is_writeable(ssk))) - return; - - sock = READ_ONCE(ssk->sk_socket); - if (sock) - mptcp_nospace(msk, sock); + if (unlikely(!mptcp_is_writeable(msk))) + mptcp_nospace(msk); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) @@ -866,6 +1157,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct page_frag *pfrag; size_t copied = 0; struct sock *ssk; + u32 sndbuf; bool tx_ok; long timeo; @@ -892,7 +1184,7 @@ restart: } __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); while (!sk_stream_memory_free(sk) || !ssk || !mptcp_page_frag_refill(ssk, pfrag)) { @@ -909,19 +1201,25 @@ restart: mptcp_reset_timer(sk); } + mptcp_nospace(msk); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; mptcp_clean_una(sk); - ssk = mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); if (list_empty(&msk->conn_list)) { ret = -ENOTCONN; goto out; } } + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, sndbuf); + pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); @@ -938,6 +1236,10 @@ restart: break; } + /* burst can be negative, we will try move to the next subflow + * at selection time, if possible. + */ + msk->snd_burst -= ret; copied += ret; tx_ok = msg_data_left(msg); @@ -947,7 +1249,6 @@ restart: if (!sk_stream_memory_free(ssk) || !mptcp_page_frag_refill(ssk, pfrag) || !mptcp_ext_cache_refill(msk)) { - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); mptcp_set_timeout(sk, ssk); @@ -995,9 +1296,9 @@ restart: mptcp_reset_timer(sk); } - ssk_check_wmem(msk, ssk); release_sock(ssk); out: + ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; } @@ -1135,10 +1436,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; + bool slow; ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); tcp_sk(ssk)->window_clamp = window_clamp; + tcp_cleanup_rbuf(ssk, 1); + unlock_sock_fast(ssk, slow); } } } @@ -1154,6 +1459,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) unsigned int moved = 0; bool done; + /* avoid looping forever below on racing close */ + if (((struct sock *)msk)->sk_state == TCP_CLOSE) + return false; + + __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); @@ -1165,7 +1475,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) release_sock(ssk); } while (!done); - return moved > 0; + if (mptcp_ofo_queue(msk) || moved > 0) { + mptcp_check_data_fin((struct sock *)msk); + return true; + } + return false; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1261,6 +1575,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, set_bit(MPTCP_DATA_READY, &msk->flags); } out_err: + pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", + msk, test_bit(MPTCP_DATA_READY, &msk->flags), + skb_queue_empty(&sk->sk_receive_queue), copied); mptcp_rcv_space_adjust(msk, copied); release_sock(sk); @@ -1311,9 +1628,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); + if (__mptcp_check_fallback(msk)) + return msk->first; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + if (!mptcp_subflow_active(subflow)) + continue; + /* still data outstanding at TCP level? Don't retransmit. */ if (!tcp_write_queue_empty(ssk)) return NULL; @@ -1338,9 +1661,9 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ -static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout) +void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow, + long timeout) { struct socket *sock = READ_ONCE(ssk->sk_socket); @@ -1371,6 +1694,10 @@ static void pm_work(struct mptcp_sock *msk) pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } + if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); + mptcp_pm_nl_rm_addr_received(msk); + } if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); mptcp_pm_nl_fully_established(msk); @@ -1383,6 +1710,20 @@ static void pm_work(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } +static void __mptcp_close_subflow(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (inet_sk_state_load(ssk) != TCP_CLOSE) + continue; + + __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + } +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1400,6 +1741,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_clean_una(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(msk); + __mptcp_move_skbs(msk); if (msk->pm.status) @@ -1474,6 +1818,7 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->rtx_queue); __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); + msk->out_of_order_queue = RB_ROOT; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -1491,23 +1836,23 @@ static int mptcp_init_sock(struct sock *sk) struct net *net = sock_net(sk); int ret; + ret = __mptcp_init_sock(sk); + if (ret) + return ret; + if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; - ret = __mptcp_init_sock(sk); - if (ret) - return ret; - ret = __mptcp_socket_create(mptcp_sk(sk)); if (ret) return ret; sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; return 0; } @@ -1531,7 +1876,7 @@ static void mptcp_cancel_work(struct sock *sk) sock_put(sk); } -static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) +void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); @@ -1809,14 +2154,21 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, return newsk; } +void mptcp_destroy_common(struct mptcp_sock *msk) +{ + skb_rbtree_purge(&msk->out_of_order_queue); + mptcp_token_destroy(msk); + mptcp_pm_free_anno_list(msk); +} + static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_token_destroy(msk); if (msk->cached_ext) __skb_ext_put(msk->cached_ext); + mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } @@ -2288,13 +2640,13 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); + pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) return mptcp_check_readable(msk); if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - if (sk_stream_is_writeable(sk) && - test_bit(MPTCP_SEND_SPACE, &msk->flags)) + if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) mask |= EPOLLOUT | EPOLLWRNORM; } if (sk->sk_shutdown & RCV_SHUTDOWN) |