author    Paolo Abeni <pabeni@redhat.com>    2020-09-14 10:01:17 +0200
committer David S. Miller <davem@davemloft.net>    2020-09-14 13:28:02 -0700
commit    d5f49190def61c47b2faff170ba8fbc48bac4371 (patch)
tree      32cb6e5177d7f54d34e9611758c2d7117f30e5bb /net/mptcp/protocol.c
parent    4596a2c1b7f55b8dac9ef4d8724cbef6e11eb74c (diff)
download  linux-d5f49190def61c47b2faff170ba8fbc48bac4371.tar.bz2
mptcp: allow picking different xmit subflows
Update the scheduler to a less trivial heuristic: cache the last used
subflow and try to send a reasonably long burst of data on it. When the
burst or the subflow send space is exhausted, pick the subflow with the
lower ratio between write space and send buffer - that is, the subflow
with the greater relative amount of free space.

v1 -> v2:
- fix 32 bit build breakage due to 64bits div
- fix checkpatch issues (uint64_t -> u64)

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
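To make the selection heuristic easier to follow outside the kernel tree, here is a minimal user-space C sketch of the ratio-based pick implemented by the patch below: queued bytes scaled by 2^32 and divided by the pacing rate, keeping the minimum per class, preferring non-backup subflows and falling back to the best backup only when no regular subflow is active. The struct subflow_snapshot type, pick_subflow() and the field names are illustrative stand-ins, not kernel APIs, and the sketch omits the burst cap and memory-pressure checks the real code performs.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct subflow_snapshot {
	const char *name;
	uint64_t wmem_queued;	/* bytes already queued on the subflow */
	uint64_t pacing_rate;	/* bytes/sec the subflow can pace out */
	int is_backup;		/* 1 for backup subflows, 0 otherwise */
	int active;		/* established and not closed */
};

static const struct subflow_snapshot *
pick_subflow(const struct subflow_snapshot *sf, size_t n)
{
	const struct subflow_snapshot *best[2] = { NULL, NULL };
	uint64_t best_ratio[2] = { UINT64_MAX, UINT64_MAX };
	int nr_active = 0;

	for (size_t i = 0; i < n; i++) {
		uint64_t ratio;

		if (!sf[i].active || !sf[i].pacing_rate)
			continue;
		nr_active += !sf[i].is_backup;

		/* same fixed-point trick as the patch: scale queued bytes by
		 * 2^32 before dividing by the pacing rate, then keep the
		 * minimum per class (regular vs. backup)
		 */
		ratio = (sf[i].wmem_queued << 32) / sf[i].pacing_rate;
		if (ratio < best_ratio[sf[i].is_backup]) {
			best_ratio[sf[i].is_backup] = ratio;
			best[sf[i].is_backup] = &sf[i];
		}
	}

	/* use the best backup only when no regular subflow is usable */
	return nr_active ? best[0] : best[1];
}

int main(void)
{
	const struct subflow_snapshot sf[] = {
		{ "wifi",     64 * 1024, 10 * 1000 * 1000, 0, 1 },
		{ "cellular",  8 * 1024,  2 * 1000 * 1000, 0, 1 },
		{ "backup",     1 * 1024,  1 * 1000 * 1000, 1, 1 },
	};
	const struct subflow_snapshot *s =
		pick_subflow(sf, sizeof(sf) / sizeof(sf[0]));

	printf("selected: %s\n", s ? s->name : "(none)");	/* "cellular" */
	return 0;
}

Scaling by 2^32 keeps the comparison in integer arithmetic; the kernel code uses div_u64() for the division, which is likely what the "32 bit build breakage due to 64bits div" note in v2 refers to.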
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--  net/mptcp/protocol.c  111
1 file changed, 95 insertions, 16 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index ec9c38d3acc7..d7af96a900c4 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1031,41 +1031,105 @@ static void mptcp_nospace(struct mptcp_sock *msk)
}
}
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+ if (subflow->request_join && !subflow->fully_established)
+ return false;
+
+ /* only send if our side has not closed yet */
+ return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
+ sizeof(struct tcphdr) - \
+ MAX_TCP_OPTION_SPACE - \
+ sizeof(struct ipv6hdr) - \
+ sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+ struct sock *ssk;
+ u64 ratio;
+};
+
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
u32 *sndbuf)
{
+ struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
- struct sock *backup = NULL;
- bool free;
+ int i, nr_active = 0;
+ struct sock *ssk;
+ u64 ratio;
+ u32 pace;
- sock_owned_by_me(sk);
+ sock_owned_by_me((struct sock *)msk);
*sndbuf = 0;
if (!mptcp_ext_cache_refill(msk))
return NULL;
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- free = sk_stream_is_writeable(subflow->tcp_sock);
- if (!free) {
- mptcp_nospace(msk);
+ if (__mptcp_check_fallback(msk)) {
+ if (!msk->first)
return NULL;
+ *sndbuf = msk->first->sk_sndbuf;
+ return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+ }
+
+ /* re-use last subflow, if the burst allow that */
+ if (msk->last_snd && msk->snd_burst > 0 &&
+ sk_stream_memory_free(msk->last_snd) &&
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
}
+ return msk->last_snd;
+ }
+
+ /* pick the subflow with the lower wmem/wspace ratio */
+ for (i = 0; i < 2; ++i) {
+ send_info[i].ssk = NULL;
+ send_info[i].ratio = -1;
+ }
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+ nr_active += !subflow->backup;
*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
- if (subflow->backup) {
- if (!backup)
- backup = ssk;
+ if (!sk_stream_memory_free(subflow->tcp_sock))
+ continue;
+ pace = READ_ONCE(ssk->sk_pacing_rate);
+ if (!pace)
continue;
- }
- return ssk;
+ ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+ pace);
+ if (ratio < send_info[subflow->backup].ratio) {
+ send_info[subflow->backup].ssk = ssk;
+ send_info[subflow->backup].ratio = ratio;
+ }
}
- return backup;
+ pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+ msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+ send_info[1].ssk, send_info[1].ratio);
+
+ /* pick the best backup if no other subflow is active */
+ if (!nr_active)
+ send_info[0].ssk = send_info[1].ssk;
+
+ if (send_info[0].ssk) {
+ msk->last_snd = send_info[0].ssk;
+ msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+ sk_stream_wspace(msk->last_snd));
+ return msk->last_snd;
+ }
+ return NULL;
}
static void ssk_check_wmem(struct mptcp_sock *msk)
@@ -1160,6 +1224,10 @@ restart:
break;
}
+ /* burst can be negative, we will try move to the next subflow
+ * at selection time, if possible.
+ */
+ msk->snd_burst -= ret;
copied += ret;
tx_ok = msg_data_left(msg);
@@ -1375,6 +1443,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool done;
+ /* avoid looping forever below on racing close */
+ if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+ return false;
+
+ __mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
@@ -1539,9 +1612,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (__mptcp_check_fallback(msk))
+ return msk->first;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
/* still data outstanding at TCP level? Don't retransmit. */
if (!tcp_write_queue_empty(ssk))
return NULL;
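A closing note on the burst cap: MPTCP_SEND_BURST_SIZE introduced earlier in the patch is (1 << 16) minus worst-case TCP/IPv6 header overhead. Assuming the usual structure sizes (20-byte tcphdr, 40 bytes of MAX_TCP_OPTION_SPACE, 40-byte ipv6hdr, 8-byte frag_hdr - these values are not spelled out in the patch itself), that works out to 65536 - 108 = 65428 bytes, so each scheduling decision allows just under 64 KiB of payload on the cached subflow. The tiny standalone check below hard-codes those assumed sizes rather than including kernel headers.

#include <stdio.h>

int main(void)
{
	/* assumed sizes: struct tcphdr = 20, MAX_TCP_OPTION_SPACE = 40,
	 * struct ipv6hdr = 40, struct frag_hdr = 8 (not taken from the patch)
	 */
	const int burst = (1 << 16) - 20 - 40 - 40 - 8;

	printf("MPTCP_SEND_BURST_SIZE ~= %d bytes\n", burst);	/* 65428 */
	return 0;
}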