summaryrefslogtreecommitdiffstats
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/options.c33
-rw-r--r--net/mptcp/protocol.c168
-rw-r--r--net/mptcp/protocol.h1
-rw-r--r--net/mptcp/subflow.c58
4 files changed, 196 insertions, 64 deletions
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 7793b6011fa7..01f1f4cf4902 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -517,7 +517,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return ret;
}
- ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+ if (subflow->use_64bit_ack) {
+ ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+ opts->ext_copy.data_ack = msk->ack_seq;
+ opts->ext_copy.ack64 = 1;
+ } else {
+ ack_size = TCPOLEN_MPTCP_DSS_ACK32;
+ opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq);
+ opts->ext_copy.ack64 = 0;
+ }
+ opts->ext_copy.use_ack = 1;
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
@@ -525,10 +534,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
dss_size += ack_size;
- opts->ext_copy.data_ack = msk->ack_seq;
- opts->ext_copy.ack64 = 1;
- opts->ext_copy.use_ack = 1;
-
*size = ALIGN(dss_size, 4);
return true;
}
@@ -987,8 +992,13 @@ mp_capable_done:
u8 flags = 0;
if (mpext->use_ack) {
- len += TCPOLEN_MPTCP_DSS_ACK64;
- flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
+ flags = MPTCP_DSS_HAS_ACK;
+ if (mpext->ack64) {
+ len += TCPOLEN_MPTCP_DSS_ACK64;
+ flags |= MPTCP_DSS_ACK64;
+ } else {
+ len += TCPOLEN_MPTCP_DSS_ACK32;
+ }
}
if (mpext->use_map) {
@@ -1005,8 +1015,13 @@ mp_capable_done:
*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
if (mpext->use_ack) {
- put_unaligned_be64(mpext->data_ack, ptr);
- ptr += 2;
+ if (mpext->ack64) {
+ put_unaligned_be64(mpext->data_ack, ptr);
+ ptr += 2;
+ } else {
+ put_unaligned_be32(mpext->data_ack32, ptr);
+ ptr += 1;
+ }
}
if (mpext->use_map) {
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 34dd0e278a82..14b253d10ccf 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -144,12 +144,29 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
unsigned int offset, size_t copy_len)
{
struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
__skb_unlink(skb, &ssk->sk_receive_queue);
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_ext_reset(skb);
+ skb_orphan(skb);
msk->ack_seq += copy_len;
+
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (offset == 0 && tail) {
+ bool fragstolen;
+ int delta;
+
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ kfree_skb_partial(skb, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return;
+ }
+ }
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
MPTCP_SKB_CB(skb)->offset = offset;
}
@@ -367,8 +384,10 @@ static void mptcp_stop_timer(struct sock *sk)
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
+ const struct sock *sk = (const struct sock *)msk;
+
if (!msk->cached_ext)
- msk->cached_ext = __skb_ext_alloc();
+ msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
return !!msk->cached_ext;
}
@@ -510,20 +529,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* fooled into a warning if we don't init here
*/
pfrag = sk_page_frag(sk);
- while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
- !mptcp_ext_cache_refill(msk)) {
- ret = sk_stream_wait_memory(ssk, timeo);
- if (ret)
- return ret;
-
- /* if sk_stream_wait_memory() sleeps snd_una can change
- * significantly, refresh the rtx queue
- */
- mptcp_clean_una(sk);
-
- if (unlikely(__mptcp_needs_tcp_fallback(msk)))
- return 0;
- }
if (!retransmission) {
write_seq = &msk->write_seq;
page = pfrag->page;
@@ -590,7 +595,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* access the skb after the sendpages call
*/
ret = do_tcp_sendpages(ssk, page, offset, psize,
- msg->msg_flags | MSG_SENDPAGE_NOTLAST);
+ msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
if (ret <= 0)
return ret;
@@ -653,6 +658,15 @@ out:
return ret;
}
+static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+{
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+
+ /* enables sk->write_space() callbacks */
+ set_bit(SOCK_NOSPACE, &sock->flags);
+}
+
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
@@ -660,19 +674,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (!mptcp_ext_cache_refill(msk))
+ return NULL;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!sk_stream_memory_free(ssk)) {
struct socket *sock = ssk->sk_socket;
- if (sock) {
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic();
-
- /* enables sk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
- }
+ if (sock)
+ mptcp_nospace(msk, sock);
return NULL;
}
@@ -698,22 +710,19 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
return;
sock = READ_ONCE(ssk->sk_socket);
-
- if (sock) {
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic();
- /* set NOSPACE only after clearing SEND_SPACE flag */
- set_bit(SOCK_NOSPACE, &sock->flags);
- }
+ if (sock)
+ mptcp_nospace(msk, sock);
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct page_frag *pfrag;
struct socket *ssock;
size_t copied = 0;
struct sock *ssk;
+ bool tx_ok;
long timeo;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
@@ -738,11 +747,29 @@ fallback:
return ret >= 0 ? ret + copied : (copied ? copied : ret);
}
+ pfrag = sk_page_frag(sk);
+restart:
mptcp_clean_una(sk);
+wait_for_sndbuf:
__mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
- while (!sk_stream_memory_free(sk) || !ssk) {
+ while (!sk_stream_memory_free(sk) ||
+ !ssk ||
+ !mptcp_page_frag_refill(ssk, pfrag)) {
+ if (ssk) {
+ /* make sure retransmit timer is
+ * running before we wait for memory.
+ *
+ * The retransmit timer might be needed
+ * to make the peer send an up-to-date
+ * MPTCP Ack.
+ */
+ mptcp_set_timeout(sk, ssk);
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+ }
+
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
@@ -759,11 +786,18 @@ fallback:
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
- while (msg_data_left(msg)) {
+ tx_ok = msg_data_left(msg);
+ while (tx_ok) {
ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -EAGAIN && timeo > 0) {
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto restart;
+ }
break;
+ }
if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
/* Can happen for passive sockets:
* 3WHS negotiated MPTCP, but first packet after is
@@ -777,6 +811,50 @@ fallback:
}
copied += ret;
+
+ tx_ok = msg_data_left(msg);
+ if (!tx_ok)
+ break;
+
+ if (!sk_stream_memory_free(ssk) ||
+ !mptcp_page_frag_refill(ssk, pfrag) ||
+ !mptcp_ext_cache_refill(msk)) {
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ tcp_push(ssk, msg->msg_flags, mss_now,
+ tcp_sk(ssk)->nonagle, size_goal);
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto restart;
+ }
+
+ /* memory is charged to mptcp level socket as well, i.e.
+ * if msg is very large, mptcp socket may run out of buffer
+ * space. mptcp_clean_una() will release data that has
+ * been acked at mptcp level in the mean time, so there is
+ * a good chance we can continue sending data right away.
+ *
+ * Normally, when the tcp subflow can accept more data, then
+ * so can the MPTCP socket. However, we need to cope with
+ * peers that might lag behind in their MPTCP-level
+ * acknowledgements, i.e. data might have been acked at
+ * tcp level only. So, we must also check the MPTCP socket
+ * limits before we send more data.
+ */
+ if (unlikely(!sk_stream_memory_free(sk))) {
+ tcp_push(ssk, msg->msg_flags, mss_now,
+ tcp_sk(ssk)->nonagle, size_goal);
+ mptcp_clean_una(sk);
+ if (!sk_stream_memory_free(sk)) {
+ /* can't send more for now, need to wait for
+ * MPTCP-level ACKs from peer.
+ *
+ * Wakeup will happen via mptcp_clean_una().
+ */
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto wait_for_sndbuf;
+ }
+ }
}
mptcp_set_timeout(sk, ssk);
@@ -1095,7 +1173,7 @@ static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
- int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+ int orig_len, orig_offset, mss_now = 0, size_goal = 0;
struct mptcp_data_frag *dfrag;
u64 orig_write_seq;
size_t copied = 0;
@@ -1117,6 +1195,9 @@ static void mptcp_worker(struct work_struct *work)
if (!dfrag)
goto unlock;
+ if (!mptcp_ext_cache_refill(msk))
+ goto reset_unlock;
+
ssk = mptcp_subflow_get_retrans(msk);
if (!ssk)
goto reset_unlock;
@@ -1128,8 +1209,8 @@ static void mptcp_worker(struct work_struct *work)
orig_offset = dfrag->offset;
orig_write_seq = dfrag->data_seq;
while (dfrag->data_len > 0) {
- ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
- &size_goal);
+ int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
+ &mss_now, &size_goal);
if (ret < 0)
break;
@@ -1137,6 +1218,9 @@ static void mptcp_worker(struct work_struct *work)
copied += ret;
dfrag->data_len -= ret;
dfrag->offset += ret;
+
+ if (!mptcp_ext_cache_refill(msk))
+ break;
}
if (copied)
tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
@@ -1653,13 +1737,6 @@ bool mptcp_finish_join(struct sock *sk)
return true;
}
-bool mptcp_sk_is_subflow(const struct sock *sk)
-{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-
- return subflow->mp_join == 1;
-}
-
static bool mptcp_memory_free(const struct sock *sk, int wake)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -2037,6 +2114,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
+ .compat_ioctl = inet6_compat_ioctl,
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d0803dfb8108..809687d3f410 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -289,6 +289,7 @@ struct mptcp_subflow_context {
data_avail : 1,
rx_eof : 1,
data_fin_tx_enable : 1,
+ use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
can_ack : 1; /* only after processing the remote a key */
u64 data_fin_tx_seq;
u32 remote_nonce;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 8968b2c065e7..493b98a0825c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -413,6 +413,20 @@ static void subflow_ulp_fallback(struct sock *sk,
tcp_sk(sk)->is_mptcp = 0;
}
+static void subflow_drop_ctx(struct sock *ssk)
+{
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
+
+ if (!ctx)
+ return;
+
+ subflow_ulp_fallback(ssk, ctx);
+ if (ctx->conn)
+ sock_put(ctx->conn);
+
+ kfree_rcu(ctx, rcu);
+}
+
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
@@ -475,18 +489,17 @@ create_child:
if (child && *own_req) {
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
+ tcp_rsk(req)->drop_req = false;
+
/* we need to fallback on ctx allocation failure and on pre-reqs
* checking above. In the latter scenario we additionally need
* to reset the context to non MPTCP status.
*/
if (!ctx || fallback) {
if (fallback_is_fatal)
- goto close_child;
+ goto dispose_child;
- if (ctx) {
- subflow_ulp_fallback(child, ctx);
- kfree_rcu(ctx, rcu);
- }
+ subflow_drop_ctx(child);
goto out;
}
@@ -510,13 +523,14 @@ create_child:
owner = mptcp_token_get_sock(ctx->token);
if (!owner)
- goto close_child;
+ goto dispose_child;
ctx->conn = (struct sock *)owner;
if (!mptcp_finish_join(child))
- goto close_child;
+ goto dispose_child;
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
+ tcp_rsk(req)->drop_req = true;
}
}
@@ -533,11 +547,15 @@ out:
!mptcp_subflow_ctx(child)->conn));
return child;
-close_child:
+dispose_child:
+ subflow_drop_ctx(child);
+ tcp_rsk(req)->drop_req = true;
tcp_send_active_reset(child, GFP_ATOMIC);
- inet_csk_prepare_forced_close(child);
+ inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
- return NULL;
+
+ /* The last child reference will be released by the caller */
+ return child;
}
static struct inet_connection_sock_af_ops subflow_specific;
@@ -666,9 +684,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
if (!mpext->dsn64) {
map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
mpext->data_seq);
+ subflow->use_64bit_ack = 0;
pr_debug("expanded seq=%llu", subflow->map_seq);
} else {
map_seq = mpext->data_seq;
+ subflow->use_64bit_ack = 1;
}
if (subflow->map_valid) {
@@ -850,6 +870,24 @@ bool mptcp_subflow_data_available(struct sock *sk)
return subflow->data_avail;
}
+/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
+ * not the ssk one.
+ *
+ * In mptcp, rwin is about the mptcp-level connection data.
+ *
+ * Data that is still on the ssk rx queue can thus be ignored,
+ * as far as mptcp peer is concerened that data is still inflight.
+ * DSS ACK is updated when skb is moved to the mptcp rx queue.
+ */
+void mptcp_space(const struct sock *ssk, int *space, int *full_space)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ const struct sock *sk = subflow->conn;
+
+ *space = tcp_space(sk);
+ *full_space = tcp_full_space(sk);
+}
+
static void subflow_data_ready(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);