summaryrefslogtreecommitdiffstats
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/Kconfig26
-rw-r--r--net/mptcp/Makefile4
-rw-r--r--net/mptcp/crypto.c152
-rw-r--r--net/mptcp/ctrl.c130
-rw-r--r--net/mptcp/options.c586
-rw-r--r--net/mptcp/protocol.c1276
-rw-r--r--net/mptcp/protocol.h240
-rw-r--r--net/mptcp/subflow.c860
-rw-r--r--net/mptcp/token.c195
9 files changed, 3469 insertions, 0 deletions
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
new file mode 100644
index 000000000000..5db56d2218c5
--- /dev/null
+++ b/net/mptcp/Kconfig
@@ -0,0 +1,26 @@
+
+config MPTCP
+ bool "MPTCP: Multipath TCP"
+ depends on INET
+ select SKB_EXTENSIONS
+ select CRYPTO_LIB_SHA256
+ help
+ Multipath TCP (MPTCP) connections send and receive data over multiple
+ subflows in order to utilize multiple network paths. Each subflow
+ uses the TCP protocol, and TCP options carry header information for
+ MPTCP.
+
+config MPTCP_IPV6
+ bool "MPTCP: IPv6 support for Multipath TCP"
+ depends on MPTCP
+ select IPV6
+ default y
+
+config MPTCP_HMAC_TEST
+ bool "Tests for MPTCP HMAC implementation"
+ default n
+ help
+ This option enables a boot-time self-test for the HMAC implementation
+ used by the MPTCP code.
+
+ Say N if you are unsure.
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
new file mode 100644
index 000000000000..4e98d9edfd0a
--- /dev/null
+++ b/net/mptcp/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_MPTCP) += mptcp.o
+
+mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
new file mode 100644
index 000000000000..40d1bb18fd60
--- /dev/null
+++ b/net/mptcp/crypto.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP cryptographic functions
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ *
+ * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and
+ * mptcp_ipv6 from multipath-tcp.org, authored by:
+ *
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ * Gregory Detal <gregory.detal@uclouvain.be>
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
+ * Andreas Ripke <ripke@neclab.eu>
+ * Vlad Dogaru <vlad.dogaru@intel.com>
+ * Octavian Purdila <octavian.purdila@intel.com>
+ * John Ronan <jronan@tssg.org>
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
+ * Brandon Heller <brandonh@stanford.edu>
+ */
+
+#include <linux/kernel.h>
+#include <crypto/sha.h>
+#include <asm/unaligned.h>
+
+#include "protocol.h"
+
+#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4)
+
+/* Derive the MPTCP token and the initial data sequence number from a key.
+ *
+ * The key is serialized in big-endian order and hashed with SHA-256. The
+ * token is word 0 of the digest and the IDSN is made of words 6 and 7,
+ * both interpreted as big-endian values.
+ *
+ * @key:   key to hash
+ * @token: if not NULL, receives the token
+ * @idsn:  if not NULL, receives the initial data sequence number
+ */
+void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
+{
+	__be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
+	__be64 input = cpu_to_be64(key);
+	struct sha256_state state;
+
+	sha256_init(&state);
+	sha256_update(&state, (__force u8 *)&input, sizeof(input));
+	sha256_final(&state, (u8 *)mptcp_hashed_key);
+
+	if (token)
+		*token = be32_to_cpu(mptcp_hashed_key[0]);
+
+	/* The digest buffer is declared as an array of 32-bit words, so a
+	 * 64-bit load through a casted pointer may be misaligned; use the
+	 * unaligned-safe accessor instead.
+	 */
+	if (idsn)
+		*idsn = get_unaligned_be64(&mptcp_hashed_key[6]);
+}
+
+/* Compute the truncated HMAC-SHA256 used by MPTCP.
+ *
+ * The HMAC key is key1 || key2 and the message is nonce1 || nonce2, all
+ * serialized in big-endian byte order.
+ *
+ * @key1:   first half of the HMAC key
+ * @key2:   second half of the HMAC key
+ * @nonce1: first half of the message
+ * @nonce2: second half of the message
+ * @hmac:   output buffer; five 32-bit words (160 bits) of the digest are
+ *          stored here
+ */
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+			   void *hmac)
+{
+	u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
+	__be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
+	__be32 *hash_out = (__force __be32 *)hmac;
+	struct sha256_state state;
+	u8 key1be[8];
+	u8 key2be[8];
+	int i;
+
+	put_unaligned_be64(key1, key1be);
+	put_unaligned_be64(key2, key2be);
+
+	/* Generate key xored with ipad. The pad covers exactly the first
+	 * SHA-256 block of 'input', hence SHA256_BLOCK_SIZE rather than
+	 * the SHA-1 block constant.
+	 */
+	memset(input, 0x36, SHA256_BLOCK_SIZE);
+	for (i = 0; i < 8; i++)
+		input[i] ^= key1be[i];
+	for (i = 0; i < 8; i++)
+		input[i + 8] ^= key2be[i];
+
+	/* the message follows the padded key */
+	put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]);
+	put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]);
+
+	sha256_init(&state);
+	sha256_update(&state, input, SHA256_BLOCK_SIZE + 8);
+
+	/* emit sha256(K1 || msg) on the second input block, so we can
+	 * reuse 'input' for the last hashing
+	 */
+	sha256_final(&state, &input[SHA256_BLOCK_SIZE]);
+
+	/* Prepare second part of hmac: key xored with opad */
+	memset(input, 0x5C, SHA256_BLOCK_SIZE);
+	for (i = 0; i < 8; i++)
+		input[i] ^= key1be[i];
+	for (i = 0; i < 8; i++)
+		input[i + 8] ^= key2be[i];
+
+	sha256_init(&state);
+	sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE);
+	sha256_final(&state, (u8 *)mptcp_hashed_key);
+
+	/* takes only first 160 bits */
+	for (i = 0; i < 5; i++)
+		hash_out[i] = mptcp_hashed_key[i];
+}
+
+#ifdef CONFIG_MPTCP_HMAC_TEST
+/* One HMAC self-test vector, consumed by test_mptcp_crypto() */
+struct test_cast {
+ /* 16 bytes, read as two big-endian 64-bit key halves */
+ char *key;
+ /* 8 bytes, read as two big-endian 32-bit nonces */
+ char *msg;
+ /* expected truncated HMAC, as 40 lower-case hex characters */
+ char *result;
+};
+
+/* we can't reuse RFC 4231 test vectors, as we have constraint on the
+ * input and key size, and we truncate the output.
+ *
+ * Note: test_mptcp_crypto() reads the .key and .msg strings as raw bytes
+ * (the characters themselves), it does not decode them from hex. Only
+ * .result is compared against a hex dump.
+ */
+static struct test_cast tests[] = {
+ {
+ .key = "0b0b0b0b0b0b0b0b",
+ .msg = "48692054",
+ .result = "8385e24fb4235ac37556b6b886db106284a1da67",
+ },
+ {
+ .key = "aaaaaaaaaaaaaaaa",
+ .msg = "dddddddd",
+ .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492",
+ },
+ {
+ .key = "0102030405060708",
+ .msg = "cdcdcdcd",
+ .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6",
+ },
+};
+
+/* Boot-time self-test for mptcp_crypto_hmac_sha().
+ *
+ * Runs every vector of tests[] through the HMAC, hex-dumps the 20 output
+ * bytes and logs whether they match the expected string. Failures are
+ * only reported through the log; the function always returns 0.
+ */
+static int __init test_mptcp_crypto(void)
+{
+	char hmac[20], hmac_hex[41];
+	u32 nonce1, nonce2;
+	u64 key1, key2;
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+		/* mptcp hmac will convert to be before computing the hmac.
+		 * The vectors are plain char strings with no alignment
+		 * guarantee, so read them with the unaligned accessors
+		 * rather than through casted pointers.
+		 */
+		key1 = get_unaligned_be64(&tests[i].key[0]);
+		key2 = get_unaligned_be64(&tests[i].key[8]);
+		nonce1 = get_unaligned_be32(&tests[i].msg[0]);
+		nonce2 = get_unaligned_be32(&tests[i].msg[4]);
+
+		mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac);
+		for (j = 0; j < 20; ++j)
+			sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
+		hmac_hex[40] = 0;
+
+		if (memcmp(hmac_hex, tests[i].result, 40))
+			pr_err("test %d failed, got %s expected %s", i,
+			       hmac_hex, tests[i].result);
+		else
+			pr_info("test %d [ ok ]", i);
+	}
+	return 0;
+}
+
+late_initcall(test_mptcp_crypto);
+#endif
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
new file mode 100644
index 000000000000..8e39585d37f3
--- /dev/null
+++ b/net/mptcp/ctrl.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2019, Tessares SA.
+ */
+
+#include <linux/sysctl.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "protocol.h"
+
+#define MPTCP_SYSCTL_PATH "net/mptcp"
+
+/* Identifier passed to net_generic() to find the struct mptcp_pernet of a
+ * namespace; its address is handed to the pernet core via mptcp_pernet_ops.
+ */
+static int mptcp_pernet_id;
+/* MPTCP state kept for each network namespace */
+struct mptcp_pernet {
+ /* handle returned by register_net_sysctl() for this namespace */
+ struct ctl_table_header *ctl_table_hdr;
+
+ /* backing storage of the "enabled" sysctl entry */
+ int mptcp_enabled;
+};
+
+/* Return the MPTCP per-namespace data attached to @net. */
+static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
+{
+	struct mptcp_pernet *pernet = net_generic(net, mptcp_pernet_id);
+
+	return pernet;
+}
+
+/* Return the current value of the "enabled" setting of @net. */
+int mptcp_is_enabled(struct net *net)
+{
+	const struct mptcp_pernet *pernet = mptcp_get_pernet(net);
+
+	return pernet->mptcp_enabled;
+}
+
+/* Template of the MPTCP sysctl table. The .data pointer of the "enabled"
+ * entry is left unset here and filled in by mptcp_pernet_new_table().
+ */
+static struct ctl_table mptcp_sysctl_table[] = {
+ {
+ .procname = "enabled",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* users with CAP_NET_ADMIN or root (either one is enough, both are
+ * not required) can change this value, same as the other sysctls
+ * of the 'net' tree.
+ */
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+/* Initialize the per-namespace settings: MPTCP starts enabled. */
+static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
+{
+ pernet->mptcp_enabled = 1;
+}
+
+/* Register the MPTCP sysctl table for @net.
+ *
+ * init_net uses mptcp_sysctl_table directly; every other namespace gets
+ * its own kmemdup()'ed copy. In both cases the "enabled" entry is pointed
+ * at @pernet->mptcp_enabled before registration.
+ *
+ * Returns 0 on success, -ENOMEM if the copy or the registration fails.
+ */
+static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
+{
+	const bool is_init_net = net_eq(net, &init_net);
+	struct ctl_table *tbl = mptcp_sysctl_table;
+	struct ctl_table_header *header;
+
+	if (!is_init_net) {
+		tbl = kmemdup(tbl, sizeof(mptcp_sysctl_table), GFP_KERNEL);
+		if (!tbl)
+			return -ENOMEM;
+	}
+
+	tbl[0].data = &pernet->mptcp_enabled;
+
+	header = register_net_sysctl(net, MPTCP_SYSCTL_PATH, tbl);
+	if (header) {
+		pernet->ctl_table_hdr = header;
+		return 0;
+	}
+
+	/* only the duplicated table is ours to free */
+	if (!is_init_net)
+		kfree(tbl);
+	return -ENOMEM;
+}
+
+/* Unregister the sysctl table of @pernet and free the table itself. */
+static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
+{
+	struct ctl_table_header *hdr = pernet->ctl_table_hdr;
+	/* fetch the table pointer before the header is unregistered */
+	struct ctl_table *tbl = hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(hdr);
+	kfree(tbl);
+}
+
+/* pernet .init hook: set the defaults and register the sysctl table.
+ * Returns the result of mptcp_pernet_new_table().
+ */
+static int __net_init mptcp_net_init(struct net *net)
+{
+	struct mptcp_pernet *pernet;
+
+	pernet = mptcp_get_pernet(net);
+	mptcp_pernet_set_defaults(pernet);
+	return mptcp_pernet_new_table(net, pernet);
+}
+
+/* pernet .exit hook: drop the sysctl table of @net.
+ * Note: the callback will only be called per extra netns
+ */
+static void __net_exit mptcp_net_exit(struct net *net)
+{
+	mptcp_pernet_del_table(mptcp_get_pernet(net));
+}
+
+/* Per-namespace hooks: .init/.exit point at mptcp_net_init() and
+ * mptcp_net_exit(); .id/.size describe the struct mptcp_pernet storage
+ * that mptcp_get_pernet() looks up through mptcp_pernet_id.
+ */
+static struct pernet_operations mptcp_pernet_ops = {
+ .init = mptcp_net_init,
+ .exit = mptcp_net_exit,
+ .id = &mptcp_pernet_id,
+ .size = sizeof(struct mptcp_pernet),
+};
+
+/* Initialize the MPTCP protocol and register the per-namespace hooks.
+ * A failure to register the pernet subsystem is fatal (panic).
+ */
+void __init mptcp_init(void)
+{
+	int err;
+
+	mptcp_proto_init();
+
+	err = register_pernet_subsys(&mptcp_pernet_ops);
+	if (err < 0)
+		panic("Failed to register MPTCP pernet subsystem.\n");
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+/* Initialize the IPv6 side of MPTCP; returns what mptcp_proto_v6_init()
+ * returns.
+ */
+int __init mptcpv6_init(void)
+{
+	return mptcp_proto_v6_init();
+}
+#endif
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
new file mode 100644
index 000000000000..45acd877bef3
--- /dev/null
+++ b/net/mptcp/options.c
@@ -0,0 +1,586 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+/* True when the masked MP_CAPABLE flag bits equal MPTCP_CAP_HMAC_SHA256. */
+static bool mptcp_cap_flag_sha256(u8 flags)
+{
+	u8 algo = flags & MPTCP_CAP_FLAG_MASK;
+
+	return algo == MPTCP_CAP_HMAC_SHA256;
+}
+
+/* Parse one MPTCP TCP option and record its content in opt_rx->mptcp.
+ *
+ * @skb:    packet carrying the option; its TCP flags and length select the
+ *          expected MP_CAPABLE size
+ * @ptr:    first byte of the option payload, the one carrying the subtype
+ *          in its upper four bits
+ * @opsize: option length, compared against the TCPOLEN_MPTCP_* constants
+ * @opt_rx: parsed TCP options; only the mptcp member is written
+ *
+ * Only the MP_CAPABLE and DSS subtypes are handled; any other subtype, and
+ * any option failing the size/version/flag checks, is ignored.
+ */
+void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
+ int opsize, struct tcp_options_received *opt_rx)
+{
+ struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
+ u8 subtype = *ptr >> 4;
+ int expected_opsize;
+ u8 version;
+ u8 flags;
+
+ switch (subtype) {
+ case MPTCPOPT_MP_CAPABLE:
+ /* strict size checking: the only accepted length depends on
+ * whether this is a SYN, a SYN/ACK, a bare ACK or an ACK
+ * carrying payload
+ */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ if (skb->len > tcp_hdr(skb)->doff << 2)
+ expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
+ else
+ expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
+ } else {
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
+ expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
+ else
+ expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
+ }
+ if (opsize != expected_opsize)
+ break;
+
+ /* try to be gentle vs future versions on the initial syn:
+ * a SYN may carry a newer version, every other packet must
+ * carry exactly the supported one
+ */
+ version = *ptr++ & MPTCP_VERSION_MASK;
+ if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
+ if (version != MPTCP_SUPPORTED_VERSION)
+ break;
+ } else if (version < MPTCP_SUPPORTED_VERSION) {
+ break;
+ }
+
+ flags = *ptr++;
+ if (!mptcp_cap_flag_sha256(flags) ||
+ (flags & MPTCP_CAP_EXTENSIBILITY))
+ break;
+
+ /* RFC 6824, Section 3.1:
+ * "For the Checksum Required bit (labeled "A"), if either
+ * host requires the use of checksums, checksums MUST be used.
+ * In other words, the only way for checksums not to be used
+ * is if both hosts in their SYNs set A=0."
+ *
+ * Section 3.3.0:
+ * "If a checksum is not present when its use has been
+ * negotiated, the receiver MUST close the subflow with a RST as
+ * it is considered broken."
+ *
+ * We don't implement DSS checksum - fall back to TCP.
+ */
+ if (flags & MPTCP_CAP_CHECKSUM_REQD)
+ break;
+
+ mp_opt->mp_capable = 1;
+ /* the keys and the data length are present only in the longer
+ * variants of the option
+ */
+ if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
+ mp_opt->sndr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
+ mp_opt->rcvr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
+ /* Section 3.1.:
+ * "the data parameters in a MP_CAPABLE are semantically
+ * equivalent to those in a DSS option and can be used
+ * interchangeably."
+ */
+ mp_opt->dss = 1;
+ mp_opt->use_map = 1;
+ mp_opt->mpc_map = 1;
+ mp_opt->data_len = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
+ version, flags, opsize, mp_opt->sndr_key,
+ mp_opt->rcvr_key, mp_opt->data_len);
+ break;
+
+ case MPTCPOPT_DSS:
+ pr_debug("DSS");
+ /* skip the byte carrying the subtype */
+ ptr++;
+
+ /* we must clear 'mpc_map' to be able to detect MP_CAPABLE
+ * map vs DSS map in mptcp_incoming_options(), and reconstruct
+ * map info accordingly
+ */
+ mp_opt->mpc_map = 0;
+ flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
+ mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
+ mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
+ mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
+ mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
+ mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
+
+ pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
+ mp_opt->data_fin, mp_opt->dsn64,
+ mp_opt->use_map, mp_opt->ack64,
+ mp_opt->use_ack);
+
+ /* derive the expected option length from the flags */
+ expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
+
+ if (mp_opt->use_ack) {
+ if (mp_opt->ack64)
+ expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
+ else
+ expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
+ }
+
+ if (mp_opt->use_map) {
+ if (mp_opt->dsn64)
+ expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
+ else
+ expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
+ }
+
+ /* RFC 6824, Section 3.3:
+ * If a checksum is present, but its use had
+ * not been negotiated in the MP_CAPABLE handshake,
+ * the checksum field MUST be ignored.
+ */
+ if (opsize != expected_opsize &&
+ opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
+ break;
+
+ /* note: the flag bits above have already been stored even when
+ * the size check fails; 'dss' is set only for a valid option
+ */
+ mp_opt->dss = 1;
+
+ if (mp_opt->use_ack) {
+ if (mp_opt->ack64) {
+ mp_opt->data_ack = get_unaligned_be64(ptr);
+ ptr += 8;
+ } else {
+ mp_opt->data_ack = get_unaligned_be32(ptr);
+ ptr += 4;
+ }
+
+ pr_debug("data_ack=%llu", mp_opt->data_ack);
+ }
+
+ if (mp_opt->use_map) {
+ if (mp_opt->dsn64) {
+ mp_opt->data_seq = get_unaligned_be64(ptr);
+ ptr += 8;
+ } else {
+ mp_opt->data_seq = get_unaligned_be32(ptr);
+ ptr += 4;
+ }
+
+ mp_opt->subflow_seq = get_unaligned_be32(ptr);
+ ptr += 4;
+
+ mp_opt->data_len = get_unaligned_be16(ptr);
+ ptr += 2;
+
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
+ mp_opt->data_seq, mp_opt->subflow_seq,
+ mp_opt->data_len);
+ }
+
+ break;
+
+ default:
+ break;
+ }
+}
+
+/* Walk the TCP options of @skb and parse every MPTCP option found.
+ *
+ * @skb:    packet whose TCP header is scanned
+ * @opt_rx: receives the parsed MPTCP state, via mptcp_parse_option()
+ *
+ * Parsing stops at the end-of-list option, at the end of the option area,
+ * or at the first malformed option (length below 2 or running past the
+ * option area).
+ */
+void mptcp_get_options(const struct sk_buff *skb,
+		       struct tcp_options_received *opt_rx)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	int length = (th->doff * 4) - sizeof(struct tcphdr);
+	const unsigned char *ptr;
+
+	ptr = (const unsigned char *)(th + 1);
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			/* Every other option carries a length byte: make
+			 * sure it still lies inside the option area before
+			 * reading it, otherwise a lone trailing kind byte
+			 * makes us read past the TCP header.
+			 */
+			if (length < 2)
+				return;
+			opsize = *ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				return;	/* don't parse partial options */
+			if (opcode == TCPOPT_MPTCP)
+				mptcp_parse_option(skb, ptr, opsize, opt_rx);
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+
+/* Fill the MP_CAPABLE option for an outgoing SYN.
+ *
+ * Always records the end sequence of @skb in subflow->snd_isn. When the
+ * subflow requests MPTCP, sets @opts and *@size for an MP_CAPABLE SYN and
+ * returns true; otherwise returns false and leaves them untouched.
+ */
+bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+		       unsigned int *size, struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+
+	/* we will use snd_isn to detect first pkt [re]transmission
+	 * in mptcp_established_options_mp()
+	 */
+	ctx->snd_isn = TCP_SKB_CB(skb)->end_seq;
+	if (!ctx->request_mptcp)
+		return false;
+
+	pr_debug("local_key=%llu", ctx->local_key);
+	opts->suboptions = OPTION_MPTCP_MPC_SYN;
+	opts->sndr_key = ctx->local_key;
+	*size = TCPOLEN_MPTCP_MPC_SYN;
+	return true;
+}
+
+/* Process the MPTCP outcome of the SYN/ACK on an active subflow.
+ *
+ * If MPTCP was requested and the peer answered with MP_CAPABLE, mark the
+ * subflow as MPTCP capable and store the peer key; otherwise clear the
+ * is_mptcp flag of the TCP socket.
+ */
+void mptcp_rcv_synsent(struct sock *sk)
+{
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	pr_debug("subflow=%p", ctx);
+	if (!ctx->request_mptcp || !tp->rx_opt.mptcp.mp_capable) {
+		tp->is_mptcp = 0;
+		return;
+	}
+
+	ctx->mp_capable = 1;
+	ctx->can_ack = 1;
+	ctx->remote_key = tp->rx_opt.mptcp.sndr_key;
+}
+
+/* Decide whether an established-state packet must carry MP_CAPABLE.
+ *
+ * Returns true, with @opts and *@size filled in, only for the packet whose
+ * sequence number equals subflow->snd_isn on an MPTCP-capable subflow that
+ * has not seen the fourth ack yet. Returns false otherwise, leaving @opts
+ * and *@size untouched.
+ */
+static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
+					 unsigned int *size,
+					 unsigned int remaining,
+					 struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+	struct mptcp_ext *ext;
+	unsigned int map_len;
+
+	pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", ctx,
+		 ctx->fourth_ack, ctx->snd_isn,
+		 skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+
+	/* When skb is not available, we better over-estimate the emitted
+	 * options len. A full DSS option is longer than
+	 * TCPOLEN_MPTCP_MPC_ACK_DATA, so bail out here and let the caller
+	 * try to fit that instead.
+	 */
+	if (!ctx->mp_capable || ctx->fourth_ack || !skb ||
+	    ctx->snd_isn != TCP_SKB_CB(skb)->seq)
+		return false;
+
+	ext = mptcp_get_ext(skb);
+	map_len = ext ? ext->data_len : 0;
+
+	/* we will check ext_copy.data_len in mptcp_write_options() to
+	 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
+	 * TCPOLEN_MPTCP_MPC_ACK
+	 */
+	opts->ext_copy.data_len = map_len;
+	opts->suboptions = OPTION_MPTCP_MPC_ACK;
+	opts->sndr_key = ctx->local_key;
+	opts->rcvr_key = ctx->remote_key;
+
+	/* Section 3.1.
+	 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
+	 * packets that start the first subflow of an MPTCP connection,
+	 * as well as the first packet that carries data
+	 */
+	*size = map_len > 0 ? ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4) :
+			      TCPOLEN_MPTCP_MPC_ACK;
+
+	pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
+		 ctx, ctx->local_key, ctx->remote_key, map_len);
+
+	return true;
+}
+
+/* Mark @ext as carrying a DATA_FIN.
+ *
+ * With an existing mapping the mapping length grows by one; without one,
+ * a dedicated one-byte mapping at the current msk write_seq is created.
+ */
+static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
+				 struct mptcp_ext *ext)
+{
+	ext->data_fin = 1;
+
+	if (ext->use_map) {
+		/* If there's an existing DSS mapping, DATA_FIN consumes
+		 * 1 additional byte of mapping space.
+		 */
+		ext->data_len++;
+		return;
+	}
+
+	/* RFC6824 requires a DSS mapping with specific values
+	 * if DATA_FIN is set but no data payload is mapped
+	 */
+	ext->use_map = 1;
+	ext->dsn64 = 1;
+	ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
+	ext->subflow_seq = 0;
+	ext->data_len = 1;
+}
+
+/* Compute the DSS option (data mapping and/or data ack) for a packet.
+ *
+ * @sk:        subflow socket
+ * @skb:       packet being sent, or NULL when only the size is estimated
+ * @size:      receives the DSS length, rounded up to a multiple of 4
+ * @remaining: option space still available
+ * @opts:      ext_copy is filled with the mapping/ack to be written
+ *
+ * Returns true when a mapping and/or an ack has to be emitted.
+ */
+static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ unsigned int dss_size = 0;
+ struct mptcp_ext *mpext;
+ struct mptcp_sock *msk;
+ unsigned int ack_size;
+ bool ret = false;
+ u8 tcp_fin;
+
+ if (skb) {
+ mpext = mptcp_get_ext(skb);
+ tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ } else {
+ mpext = NULL;
+ tcp_fin = 0;
+ }
+
+ /* reserve room for a 64-bit mapping when estimating (no skb), when the
+ * skb carries a mapping, or when it carries a TCP FIN
+ */
+ if (!skb || (mpext && mpext->use_map) || tcp_fin) {
+ unsigned int map_size;
+
+ map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
+
+ /* NOTE(review): 'remaining' is unsigned, is not compared with
+ * map_size first, and is not read again in this function
+ */
+ remaining -= map_size;
+ dss_size = map_size;
+ if (mpext)
+ opts->ext_copy = *mpext;
+
+ /* a TCP FIN sent while the msk is no longer ESTABLISHED also
+ * carries the DATA_FIN
+ */
+ if (skb && tcp_fin &&
+ subflow->conn->sk_state != TCP_ESTABLISHED)
+ mptcp_write_data_fin(subflow, &opts->ext_copy);
+ ret = true;
+ }
+
+ opts->ext_copy.use_ack = 0;
+ msk = mptcp_sk(subflow->conn);
+ /* no data ack until the msk is allowed to ack */
+ if (!msk || !READ_ONCE(msk->can_ack)) {
+ *size = ALIGN(dss_size, 4);
+ return ret;
+ }
+
+ ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+
+ /* Add kind/length/subtype/flag overhead if mapping is not populated */
+ if (dss_size == 0)
+ ack_size += TCPOLEN_MPTCP_DSS_BASE;
+
+ dss_size += ack_size;
+
+ opts->ext_copy.data_ack = msk->ack_seq;
+ opts->ext_copy.ack64 = 1;
+ opts->ext_copy.use_ack = 1;
+
+ *size = ALIGN(dss_size, 4);
+ return true;
+}
+
+/* Select the MPTCP option to emit on an established-state packet.
+ *
+ * @sk:        subflow socket
+ * @skb:       packet being sent, may be NULL
+ * @size:      incremented by the length of the selected option
+ * @remaining: option space still available
+ * @opts:      filled with the option content
+ *
+ * MP_CAPABLE is tried first and DSS only if MP_CAPABLE does not apply.
+ * Returns true when an option has to be written, false when none applies
+ * or when the selected option would not fit in @remaining.
+ */
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+			       unsigned int *size, unsigned int remaining,
+			       struct mptcp_out_options *opts)
+{
+	unsigned int opt_size = 0;
+	bool ret;
+
+	ret = mptcp_established_options_mp(sk, skb, &opt_size, remaining,
+					   opts) ||
+	      mptcp_established_options_dss(sk, skb, &opt_size, remaining,
+					    opts);
+
+	/* we reserved enough space for the above options, and exceeding the
+	 * TCP option space would be fatal
+	 */
+	if (WARN_ON_ONCE(opt_size > remaining))
+		return false;
+
+	/* 'remaining' is passed by value: only *size is reported back */
+	*size += opt_size;
+
+	return ret;
+}
+
+/* Fill the MP_CAPABLE option for an outgoing SYN/ACK.
+ *
+ * Returns true, with @opts and *@size set, when the request socket is
+ * MPTCP capable; returns false and touches nothing otherwise.
+ */
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+			  struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_request_sock *sreq = mptcp_subflow_rsk(req);
+
+	if (!sreq->mp_capable)
+		return false;
+
+	opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
+	opts->sndr_key = sreq->local_key;
+	*size = TCPOLEN_MPTCP_MPC_SYNACK;
+	pr_debug("subflow_req=%p, local_key=%llu", sreq, sreq->local_key);
+	return true;
+}
+
+/* Track the first in-sequence packet received after the handshake.
+ *
+ * Returns false only when that packet arrives on a subflow that cannot ack
+ * yet and carries no MP_CAPABLE: the subflow is then switched to plain TCP.
+ * Returns true in every other case, i.e. when option processing may go on.
+ */
+static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb,
+ struct mptcp_options_received *mp_opt)
+{
+ /* here we can process OoO, in-window pkts, only in-sequence 4th ack
+ * are relevant
+ */
+ if (likely(subflow->fourth_ack ||
+ TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
+ return true;
+
+ /* a data ack from the peer completes the fourth-ack exchange */
+ if (mp_opt->use_ack)
+ subflow->fourth_ack = 1;
+
+ /* the remote key is already known */
+ if (subflow->can_ack)
+ return true;
+
+ /* If the first established packet does not contain MP_CAPABLE + data
+ * then fallback to TCP
+ */
+ if (!mp_opt->mp_capable) {
+ subflow->mp_capable = 0;
+ tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
+ return false;
+ }
+ /* learn the peer key from the MP_CAPABLE option */
+ subflow->remote_key = mp_opt->sndr_key;
+ subflow->can_ack = 1;
+ return true;
+}
+
+/* Attach the received DSS information to @skb as an MPTCP skb extension.
+ *
+ * @sk:     subflow socket the packet was received on
+ * @skb:    received packet
+ * @opt_rx: parsed TCP options; only the mptcp member is read
+ *
+ * Nothing is attached when check_fourth_ack() rejects the packet, when no
+ * DSS data was parsed, or when the extension cannot be allocated.
+ */
+void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_options_received *opt_rx)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_options_received *mp_opt;
+ struct mptcp_ext *mpext;
+
+ mp_opt = &opt_rx->mptcp;
+ if (!check_fourth_ack(subflow, skb, mp_opt))
+ return;
+
+ if (!mp_opt->dss)
+ return;
+
+ mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
+ if (!mpext)
+ return;
+
+ memset(mpext, 0, sizeof(*mpext));
+
+ if (mp_opt->use_map) {
+ if (mp_opt->mpc_map) {
+ /* this is an MP_CAPABLE carrying MPTCP data:
+ * we know this maps the first chunk of data, so the
+ * mapping starts one past the IDSN derived from the
+ * remote key, at subflow sequence 1
+ */
+ mptcp_crypto_key_sha(subflow->remote_key, NULL,
+ &mpext->data_seq);
+ mpext->data_seq++;
+ mpext->subflow_seq = 1;
+ mpext->dsn64 = 1;
+ mpext->mpc_map = 1;
+ } else {
+ mpext->data_seq = mp_opt->data_seq;
+ mpext->subflow_seq = mp_opt->subflow_seq;
+ mpext->dsn64 = mp_opt->dsn64;
+ }
+ mpext->data_len = mp_opt->data_len;
+ mpext->use_map = 1;
+ }
+
+ if (mp_opt->use_ack) {
+ mpext->data_ack = mp_opt->data_ack;
+ mpext->use_ack = 1;
+ mpext->ack64 = mp_opt->ack64;
+ }
+
+ mpext->data_fin = mp_opt->data_fin;
+}
+
+/* Serialize the MPTCP options described by @opts into the TCP header.
+ *
+ * @ptr:  write position inside the TCP option area, in 32-bit words
+ * @opts: options selected by the mptcp_*_options() helpers
+ *
+ * An MP_CAPABLE option is written when one of the MPC suboptions is set,
+ * then a DSS option when ext_copy carries an ack and/or a mapping.
+ */
+void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
+{
+ if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
+ OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
+ u8 len;
+
+ /* for the ACK variant, a non-zero data_len selects the longer
+ * form carrying the data length
+ */
+ if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
+ len = TCPOLEN_MPTCP_MPC_SYN;
+ else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
+ len = TCPOLEN_MPTCP_MPC_SYNACK;
+ else if (opts->ext_copy.data_len)
+ len = TCPOLEN_MPTCP_MPC_ACK_DATA;
+ else
+ len = TCPOLEN_MPTCP_MPC_ACK;
+
+ /* first word: kind, length, subtype, version and flags */
+ *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
+ (MPTCPOPT_MP_CAPABLE << 12) |
+ (MPTCP_SUPPORTED_VERSION << 8) |
+ MPTCP_CAP_HMAC_SHA256);
+
+ /* the SYN carries no key */
+ if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
+ opts->suboptions))
+ goto mp_capable_done;
+
+ put_unaligned_be64(opts->sndr_key, ptr);
+ ptr += 2;
+ /* the SYN/ACK carries the sender key only */
+ if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
+ goto mp_capable_done;
+
+ put_unaligned_be64(opts->rcvr_key, ptr);
+ ptr += 2;
+ if (!opts->ext_copy.data_len)
+ goto mp_capable_done;
+
+ /* 16-bit data length followed by two NOPs of padding */
+ put_unaligned_be32(opts->ext_copy.data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ ptr += 1;
+ }
+
+mp_capable_done:
+ if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
+ struct mptcp_ext *mpext = &opts->ext_copy;
+ u8 len = TCPOLEN_MPTCP_DSS_BASE;
+ u8 flags = 0;
+
+ if (mpext->use_ack) {
+ len += TCPOLEN_MPTCP_DSS_ACK64;
+ flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
+ }
+
+ if (mpext->use_map) {
+ len += TCPOLEN_MPTCP_DSS_MAP64;
+
+ /* Use only 64-bit mapping flags for now, add
+ * support for optional 32-bit mappings later.
+ */
+ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
+ if (mpext->data_fin)
+ flags |= MPTCP_DSS_DATA_FIN;
+ }
+
+ /* first word: kind, length, subtype and DSS flags */
+ *ptr++ = htonl((TCPOPT_MPTCP << 24) |
+ (len << 16) |
+ (MPTCPOPT_DSS << 12) |
+ (flags));
+
+ if (mpext->use_ack) {
+ put_unaligned_be64(mpext->data_ack, ptr);
+ ptr += 2;
+ }
+
+ if (mpext->use_map) {
+ put_unaligned_be64(mpext->data_seq, ptr);
+ ptr += 2;
+ put_unaligned_be32(mpext->subflow_seq, ptr);
+ ptr += 1;
+ /* 16-bit data length followed by two NOPs of padding */
+ put_unaligned_be32(mpext->data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ }
+ }
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
new file mode 100644
index 000000000000..39fdca79ce90
--- /dev/null
+++ b/net/mptcp/protocol.c
@@ -0,0 +1,1276 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/sched/signal.h>
+#include <linux/atomic.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/transp_v6.h>
+#endif
+#include <net/mptcp.h>
+#include "protocol.h"
+
+#define MPTCP_SAME_STATE TCP_MAX_STATES
+
+static void __mptcp_close(struct sock *sk, long timeout);
+
+/* Return the plain TCP stream ops matching the address family of @sk:
+ * inet6_stream_ops for AF_INET6 when IPv6 is enabled, inet_stream_ops
+ * otherwise.
+ */
+static const struct proto_ops *tcp_proto_ops(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return &inet6_stream_ops;
+#endif
+ return &inet_stream_ops;
+}
+
+/* MP_CAPABLE handshake failed, convert msk to plain tcp, replacing
+ * socket->sk and stream ops and destroying msk.
+ * Returns the struct socket that was attached to the msk and is now
+ * attached to @ssk; the msk itself must not be accessed anymore after this
+ * function completes.
+ * Called with msk lock held, releases such lock before returning
+ */
+static struct socket *__mptcp_fallback_to_tcp(struct mptcp_sock *msk,
+ struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct socket *sock;
+ struct sock *sk;
+
+ sk = (struct sock *)msk;
+ sock = sk->sk_socket;
+ subflow = mptcp_subflow_ctx(ssk);
+
+ /* detach the msk socket */
+ list_del_init(&subflow->node);
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ /* socket is now TCP */
+ lock_sock(ssk);
+ sock_graft(ssk, sock);
+ if (subflow->conn) {
+ /* We can't release the ULP data on a live socket,
+ * restore the tcp callback
+ */
+ mptcp_subflow_tcp_fallback(ssk, subflow);
+ /* drop the reference the subflow held on the msk */
+ sock_put(subflow->conn);
+ subflow->conn = NULL;
+ }
+ release_sock(ssk);
+ sock->ops = tcp_proto_ops(ssk);
+
+ /* destroy the left-over msk sock */
+ __mptcp_close(sk, 0);
+ return sock;
+}
+
+/* Return the initial subflow socket of @msk while the MP_CAPABLE handshake
+ * has not completed yet or has failed, i.e. while msk->can_ack is not set.
+ * Return NULL when there is no such socket or the msk can already ack.
+ */
+static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
+{
+	if (msk->subflow && !READ_ONCE(msk->can_ack))
+		return msk->subflow;
+
+	return NULL;
+}
+
+/* True when @msk has a first subflow and that subflow is no longer flagged
+ * as MPTCP, meaning the msk must be converted to plain TCP.
+ */
+static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
+{
+ return msk->first && !sk_is_mptcp(msk->first);
+}
+
+/* If the mp_capable handshake has failed, fall back msk to plain TCP,
+ * release the socket lock and return a reference to the now TCP socket.
+ * Otherwise return NULL, with the lock still held.
+ * Must be called with the msk socket lock held.
+ */
+static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
+{
+ sock_owned_by_me((const struct sock *)msk);
+
+ if (likely(!__mptcp_needs_tcp_fallback(msk)))
+ return NULL;
+
+ if (msk->subflow) {
+ /* the first subflow is an active connection, discard the
+ * paired socket
+ */
+ msk->subflow->sk = NULL;
+ sock_release(msk->subflow);
+ msk->subflow = NULL;
+ }
+
+ return __mptcp_fallback_to_tcp(msk, msk->first);
+}
+
+/* A subflow socket may be created only while @msk has no first subflow. */
+static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
+{
+ return !msk->first;
+}
+
+/* Return the initial subflow socket of @msk, creating it if needed.
+ *
+ * @msk:   MPTCP socket
+ * @state: new state to store on the msk, or MPTCP_SAME_STATE to leave the
+ *         current state untouched
+ *
+ * Returns the subflow socket, ERR_PTR(-EINVAL) when no pre-handshake socket
+ * is available and a first subflow already exists, or the ERR_PTR() of the
+ * mptcp_subflow_create_socket() error.
+ */
+static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ struct socket *ssock;
+ int err;
+
+ /* reuse the existing pre-handshake socket, if any */
+ ssock = __mptcp_nmpc_socket(msk);
+ if (ssock)
+ goto set_state;
+
+ if (!__mptcp_can_create_subflow(msk))
+ return ERR_PTR(-EINVAL);
+
+ err = mptcp_subflow_create_socket(sk, &ssock);
+ if (err)
+ return ERR_PTR(err);
+
+ /* track the new socket as first subflow and ask for MPTCP on it */
+ msk->first = ssock->sk;
+ msk->subflow = ssock;
+ subflow = mptcp_subflow_ctx(ssock->sk);
+ list_add(&subflow->node, &msk->conn_list);
+ subflow->request_mptcp = 1;
+
+set_state:
+ if (state != MPTCP_SAME_STATE)
+ inet_sk_state_store(sk, state);
+ return ssock;
+}
+
+/* Pick the subflow to transmit on: the TCP socket of the first subflow
+ * yielded by mptcp_for_each_subflow(), or NULL when there is none.
+ * Must be called with the msk socket lock held.
+ */
+static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ /* the loop body returns on the very first iteration */
+ mptcp_for_each_subflow(msk, subflow) {
+ return mptcp_subflow_tcp_sock(subflow);
+ }
+
+ return NULL;
+}
+
+/* Make sure @msk holds a pre-allocated skb extension.
+ * Returns true when msk->cached_ext is available, false if the allocation
+ * failed.
+ */
+static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
+{
+	if (msk->cached_ext)
+		return true;
+
+	msk->cached_ext = __skb_ext_alloc();
+	return msk->cached_ext != NULL;
+}
+
+/* Return the TCP socket of the first subflow with data_avail set, or NULL
+ * when no subflow has data available.
+ * Must be called with the msk socket lock held.
+ */
+static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+
+ sock_owned_by_me(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (subflow->data_avail)
+ return mptcp_subflow_tcp_sock(subflow);
+ }
+
+ return NULL;
+}
+
+/* Tell whether new data can be appended to @skb.
+ *
+ * That requires TCP to allow collapsing into @skb, @skb to carry an MPTCP
+ * extension, and the mapping in @mpext to end exactly at msk->write_seq.
+ */
+static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
+					     const struct sk_buff *skb,
+					     const struct mptcp_ext *mpext)
+{
+	if (!tcp_skb_can_collapse_to(skb) || !mpext)
+		return false;
+
+	/* can collapse only if MPTCP level sequence is in order */
+	return mpext->data_seq + mpext->data_len == msk->write_seq;
+}
+
+/* Copy one chunk of @msg into the msk page frag and queue it on @ssk.
+ *
+ * @sk:       MPTCP socket
+ * @ssk:      subflow socket used for the transmission
+ * @msg:      user message; its iterator is advanced by the queued amount
+ * @timeo:    send timeout, updated while waiting for memory
+ * @pmss_now: receives the mss computed by tcp_send_mss()
+ * @ps_goal:  receives the size goal computed by tcp_send_mss()
+ *
+ * Returns the number of bytes queued, 0 when a fallback to TCP is needed
+ * (or do_tcp_sendpages() queued nothing), or a negative error.
+ */
+static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
+ struct msghdr *msg, long *timeo, int *pmss_now,
+ int *ps_goal)
+{
+ int mss_now, avail_size, size_goal, ret;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_ext *mpext = NULL;
+ struct sk_buff *skb, *tail;
+ bool can_collapse = false;
+ struct page_frag *pfrag;
+ size_t psize;
+
+ /* use the mptcp page cache so that we can easily move the data
+ * from one substream to another, but do per subflow memory accounting
+ */
+ pfrag = sk_page_frag(sk);
+ /* both a page fragment and a cached skb extension are needed */
+ while (!sk_page_frag_refill(ssk, pfrag) ||
+ !mptcp_ext_cache_refill(msk)) {
+ ret = sk_stream_wait_memory(ssk, timeo);
+ if (ret)
+ return ret;
+ if (unlikely(__mptcp_needs_tcp_fallback(msk)))
+ return 0;
+ }
+
+ /* compute copy limit */
+ mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
+ *pmss_now = mss_now;
+ *ps_goal = size_goal;
+ avail_size = size_goal;
+ skb = tcp_write_queue_tail(ssk);
+ if (skb) {
+ mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+
+ /* Limit the write to the size available in the
+ * current skb, if any, so that we create at most a new skb.
+ * Explicitly tells TCP internals to avoid collapsing on later
+ * queue management operation, to avoid breaking the ext <->
+ * SSN association set here
+ */
+ /* NOTE(review): if skb->len is an unsigned type the subtraction
+ * below cannot yield a negative value - confirm that skb->len
+ * never exceeds size_goal here
+ */
+ can_collapse = (size_goal - skb->len > 0) &&
+ mptcp_skb_can_collapse_to(msk, skb, mpext);
+ if (!can_collapse)
+ TCP_SKB_CB(skb)->eor = 1;
+ else
+ avail_size = size_goal - skb->len;
+ }
+ psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
+
+ /* Copy to page */
+ pr_debug("left=%zu", msg_data_left(msg));
+ psize = copy_page_from_iter(pfrag->page, pfrag->offset,
+ min_t(size_t, msg_data_left(msg), psize),
+ &msg->msg_iter);
+ pr_debug("left=%zu", msg_data_left(msg));
+ if (!psize)
+ return -EINVAL;
+
+ /* tell the TCP stack to delay the push so that we can safely
+ * access the skb after the sendpages call
+ */
+ ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+ msg->msg_flags | MSG_SENDPAGE_NOTLAST);
+ if (ret <= 0)
+ return ret;
+ /* give back to the iterator what TCP did not take */
+ if (unlikely(ret < psize))
+ iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ /* if the tail skb extension is still the cached one, collapsing
+ * really happened. Note: we can't check for 'same skb' as the sk_buff
+ * hdr on tail can be transmitted, freed and re-allocated by the
+ * do_tcp_sendpages() call
+ */
+ tail = tcp_write_queue_tail(ssk);
+ if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
+ WARN_ON_ONCE(!can_collapse);
+ mpext->data_len += ret;
+ goto out;
+ }
+
+ /* new skb: attach the cached extension and describe the mapping */
+ skb = tcp_write_queue_tail(ssk);
+ mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
+ msk->cached_ext = NULL;
+
+ memset(mpext, 0, sizeof(*mpext));
+ mpext->data_seq = msk->write_seq;
+ mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
+ mpext->data_len = ret;
+ mpext->use_map = 1;
+ mpext->dsn64 = 1;
+
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
+ mpext->data_seq, mpext->subflow_seq, mpext->data_len,
+ mpext->dsn64);
+
+out:
+ /* account the queued bytes at page, msk and subflow level */
+ pfrag->offset += ret;
+ msk->write_seq += ret;
+ mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+
+ return ret;
+}
+
+/* Record the lack of send space when subflow @ssk is not writeable.
+ *
+ * Nothing is done while @ssk is still writeable or when no struct socket is
+ * attached to it. Otherwise MPTCP_SEND_SPACE is cleared on @msk first and
+ * only then SOCK_NOSPACE is raised on the attached struct socket.
+ */
+static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
+{
+	struct socket *owner;
+
+	if (likely(sk_stream_is_writeable(ssk)))
+		return;
+
+	owner = READ_ONCE(ssk->sk_socket);
+	if (!owner)
+		return;
+
+	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+	/* the SEND_SPACE clear must be ordered before the NOSPACE set */
+	smp_mb__after_atomic();
+	set_bit(SOCK_NOSPACE, &owner->flags);
+}
+
+/* sendmsg() handler for MPTCP sockets.
+ *
+ * Only MSG_MORE, MSG_DONTWAIT and MSG_NOSIGNAL are accepted; any other flag
+ * yields -EOPNOTSUPP. If __mptcp_tcp_fallback() returns a socket, the message
+ * is passed to sock_sendmsg() on it. Otherwise the payload is queued, one
+ * fragment per mptcp_sendmsg_frag() call, on the subflow returned by
+ * mptcp_subflow_get() (-ENOTCONN if there is none).
+ *
+ * Returns the number of bytes queued if any, else the last error (or 0).
+ */
+static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ int mss_now = 0, size_goal = 0, ret = 0;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+ size_t copied = 0;
+ struct sock *ssk;
+ long timeo;
+
+ if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock)) {
+fallback:
+ pr_debug("fallback passthrough");
+ ret = sock_sendmsg(ssock, msg);
+ /* NOTE(review): no release_sock(sk) on this path; presumably
+ * __mptcp_tcp_fallback() drops the msk lock when it returns a
+ * socket - confirm against its definition.
+ *
+ * Bytes already queued through MPTCP before falling back are
+ * added to the result, and win over a fallback error.
+ */
+ return ret >= 0 ? ret + copied : (copied ? copied : ret);
+ }
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ ssk = mptcp_subflow_get(msk);
+ if (!ssk) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ pr_debug("conn_list->subflow=%p", ssk);
+
+ lock_sock(ssk);
+ while (msg_data_left(msg)) {
+ ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+ &size_goal);
+ if (ret < 0)
+ break;
+ /* nothing was sent and the msk now requires plain TCP: drop
+ * the subflow lock and jump into the fallback branch above
+ */
+ if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
+ release_sock(ssk);
+ ssock = __mptcp_tcp_fallback(msk);
+ goto fallback;
+ }
+
+ copied += ret;
+ }
+
+ /* partial success is reported as success; the frags were queued with
+ * the push delayed, so push them now
+ */
+ if (copied) {
+ ret = copied;
+ tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+ size_goal);
+ }
+
+ ssk_check_wmem(msk, ssk);
+ release_sock(ssk);
+ release_sock(sk);
+ return ret;
+}
+
+/* Actor used with tcp_read_sock(): consume up to desc->count bytes of @skb,
+ * starting at @offset.
+ *
+ * When the mptcp_read_arg carried by @desc has a msghdr, the bytes are copied
+ * into it; with a NULL msghdr the payload is only accounted as consumed.
+ *
+ * Returns the number of bytes consumed, or the copy error, which is also
+ * stored in desc->error.
+ */
+int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
+		     unsigned int offset, size_t len)
+{
+	struct mptcp_read_arg *arg = desc->arg.data;
+	size_t chunk;
+	int rc;
+
+	chunk = min(desc->count, len);
+
+	if (unlikely(!arg->msg)) {
+		pr_debug("Flushing skb payload");
+		goto consumed;
+	}
+
+	rc = skb_copy_datagram_msg(skb, offset, arg->msg, chunk);
+	if (rc) {
+		pr_debug("error path");
+		desc->error = rc;
+		return rc;
+	}
+
+consumed:
+	desc->count -= chunk;
+
+	pr_debug("consumed %zu bytes, %zu left", chunk, desc->count);
+	return chunk;
+}
+
+/* Wait on the msk wait queue, bounded by @timeo, for MPTCP_DATA_READY.
+ *
+ * The wait condition uses test_and_clear_bit(), so the flag is cleared when
+ * it is observed set. SOCKWQ_ASYNC_WAITDATA is set on @sk for the duration of
+ * the wait.
+ */
+static void mptcp_wait_data(struct sock *sk, long *timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+
+ sk_wait_event(sk, timeo,
+ test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
+
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+/* recvmsg() handler for MPTCP sockets.
+ *
+ * Only MSG_WAITALL and MSG_DONTWAIT are accepted in msg->msg_flags; any
+ * other flag yields -EOPNOTSUPP. If __mptcp_tcp_fallback() returns a socket,
+ * the read is passed to sock_recvmsg() on it. Otherwise data is pulled from
+ * the subflow returned by mptcp_subflow_recv_lookup() through
+ * tcp_read_sock()/mptcp_read_actor(), one DSS mapping at a time, blocking in
+ * mptcp_wait_data() while less than the rcvlowat target has been copied.
+ *
+ * Returns the number of bytes copied, or a negative error if none were.
+ */
+static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ bool more_data_avail = false;
+ struct mptcp_read_arg arg;
+ read_descriptor_t desc;
+ bool wait_data = false;
+ struct socket *ssock;
+ struct tcp_sock *tp;
+ bool done = false;
+ struct sock *ssk;
+ int copied = 0;
+ int target;
+ long timeo;
+
+ if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock)) {
+fallback:
+ pr_debug("fallback-read subflow=%p",
+ mptcp_subflow_ctx(ssock->sk));
+ /* NOTE(review): no release_sock(sk) on this path; presumably
+ * __mptcp_tcp_fallback() drops the msk lock when it returns a
+ * socket - confirm against its definition.
+ */
+ copied = sock_recvmsg(ssock, msg, flags);
+ return copied;
+ }
+
+ arg.msg = msg;
+ desc.arg.data = &arg;
+ desc.error = 0;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ /* 'copied' is an int: cap the request accordingly */
+ len = min_t(size_t, len, INT_MAX);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ while (!done) {
+ u32 map_remaining;
+ int bytes_read;
+
+ ssk = mptcp_subflow_recv_lookup(msk);
+ pr_debug("msk=%p ssk=%p", msk, ssk);
+ if (!ssk)
+ goto wait_for_data;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ tp = tcp_sk(ssk);
+
+ lock_sock(ssk);
+ do {
+ /* try to read as much data as available */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+ /* never read past the end of the current mapping */
+ desc.count = min_t(size_t, len - copied, map_remaining);
+ pr_debug("reading %zu bytes, copied %d", desc.count,
+ copied);
+ bytes_read = tcp_read_sock(ssk, &desc,
+ mptcp_read_actor);
+ if (bytes_read < 0) {
+ /* report the error only if no data was copied */
+ if (!copied)
+ copied = bytes_read;
+ done = true;
+ goto next;
+ }
+
+ pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
+ msk->ack_seq + bytes_read);
+ msk->ack_seq += bytes_read;
+ copied += bytes_read;
+ if (copied >= len) {
+ done = true;
+ goto next;
+ }
+ if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
+ pr_err("Urgent data present, cannot proceed");
+ done = true;
+ goto next;
+ }
+next:
+ /* evaluated on every exit path: the result drives the
+ * MPTCP_DATA_READY update after the outer loop
+ */
+ more_data_avail = mptcp_subflow_data_available(ssk);
+ } while (more_data_avail && !done);
+ release_sock(ssk);
+ continue;
+
+wait_for_data:
+ more_data_avail = false;
+
+ /* only the master socket status is relevant here. The exit
+ * conditions mirror closely tcp_recvmsg()
+ */
+ if (copied >= target)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current))
+ break;
+ } else {
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ copied = -ENOTCONN;
+ break;
+ }
+
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ pr_debug("block timeout %ld", timeo);
+ wait_data = true;
+ mptcp_wait_data(sk, &timeo);
+ if (unlikely(__mptcp_tcp_fallback(msk)))
+ goto fallback;
+ }
+
+ /* keep MPTCP_DATA_READY in sync with what is left on the subflows.
+ * When mptcp_wait_data() ran, the flag was already consumed by its
+ * test_and_clear_bit(), hence the !wait_data check.
+ */
+ if (more_data_avail) {
+ if (!test_bit(MPTCP_DATA_READY, &msk->flags))
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ } else if (!wait_data) {
+ clear_bit(MPTCP_DATA_READY, &msk->flags);
+
+ /* .. race-breaker: ssk might get new data after last
+ * data_available() returns false.
+ */
+ ssk = mptcp_subflow_recv_lookup(msk);
+ if (unlikely(ssk))
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ }
+
+ release_sock(sk);
+ return copied;
+}
+
+/* Unlink @subflow from the msk subflow list and dispose of its TCP sock.
+ *
+ * A subflow is either outgoing (connect) or incoming (accept). Outgoing
+ * subflows own an in-kernel 'struct socket', which is released. Incoming
+ * subflows have no 'struct socket' of their own (theirs is NULL or is the
+ * MPTCP parent's), so they are closed with tcp_close() once detached.
+ */
+static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+			      struct mptcp_subflow_context *subflow,
+			      long timeout)
+{
+	struct socket *ssock = READ_ONCE(ssk->sk_socket);
+	bool outgoing;
+
+	list_del(&subflow->node);
+
+	outgoing = ssock && ssock != sk->sk_socket;
+	if (outgoing)
+		sock_release(ssock);
+	else
+		tcp_close(ssk, timeout);
+}
+
+/* Set up the MPTCP-specific part of @sk: no first subflow, an empty subflow
+ * list, and send space flagged as available. Always returns 0.
+ */
+static int __mptcp_init_sock(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	msk->first = NULL;
+	INIT_LIST_HEAD(&msk->conn_list);
+
+	/* non-atomic variant, as in the original initialisation */
+	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
+
+	return 0;
+}
+
+/* proto->init hook: initialise @sk if MPTCP is enabled in its network
+ * namespace, else fail with -ENOPROTOOPT.
+ */
+static int mptcp_init_sock(struct sock *sk)
+{
+	if (mptcp_is_enabled(sock_net(sk)))
+		return __mptcp_init_sock(sk);
+
+	return -ENOPROTOOPT;
+}
+
+/* Shut down subflow @ssk according to the shutdown mask @how.
+ *
+ * - TCP_SYN_SENT, or TCP_LISTEN with RCV_SHUTDOWN requested: disconnect;
+ * - TCP_LISTEN without RCV_SHUTDOWN: nothing to do;
+ * - any other state: record @how in sk_shutdown and call tcp_shutdown().
+ *
+ * The subflow lock is taken here and the state-change callback is always
+ * invoked before releasing it.
+ */
+static void mptcp_subflow_shutdown(struct sock *ssk, int how)
+{
+	int state;
+
+	lock_sock(ssk);
+
+	state = ssk->sk_state;
+	if (state == TCP_SYN_SENT ||
+	    (state == TCP_LISTEN && (how & RCV_SHUTDOWN))) {
+		tcp_disconnect(ssk, O_NONBLOCK);
+	} else if (state != TCP_LISTEN) {
+		ssk->sk_shutdown |= how;
+		tcp_shutdown(ssk, how);
+	}
+
+	/* Wake up anyone sleeping in poll. */
+	ssk->sk_state_change(ssk);
+	release_sock(ssk);
+}
+
+/* Tear down the MPTCP socket @sk.
+ *
+ * Must be called with the msk lock held; the lock is released before
+ * returning. The token is destroyed, the msk moves to TCP_CLOSE, every
+ * subflow on the list is closed, and any cached skb extension is dropped.
+ */
+static void __mptcp_close(struct sock *sk, long timeout)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *cur, *next;
+
+	mptcp_token_destroy(msk->token);
+	inet_sk_state_store(sk, TCP_CLOSE);
+
+	/* _safe variant: __mptcp_close_ssk() unlinks the entry */
+	list_for_each_entry_safe(cur, next, &msk->conn_list, node)
+		__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(cur), cur,
+				  timeout);
+
+	if (msk->cached_ext)
+		__skb_ext_put(msk->cached_ext);
+
+	release_sock(sk);
+	sk_common_release(sk);
+}
+
+/* proto->close hook: take the msk lock and hand over to __mptcp_close(),
+ * which releases that lock before returning.
+ */
+static void mptcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __mptcp_close(sk, timeout);
+}
+
+/* Mirror the addressing information of subflow @ssk onto the MPTCP socket
+ * @msk: IPv4 addresses and ports always, plus the IPv6 addresses, source
+ * address and flow label when IPv6 support is built in.
+ */
+static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
+{
+	const struct inet_sock *src = inet_sk(ssk);
+	struct inet_sock *dst = inet_sk(msk);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	const struct ipv6_pinfo *src6 = inet6_sk(ssk);
+	struct ipv6_pinfo *dst6 = inet6_sk(msk);
+
+	msk->sk_v6_daddr = ssk->sk_v6_daddr;
+	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;
+
+	/* either side may lack the IPv6-specific info */
+	if (dst6 && src6) {
+		dst6->saddr = src6->saddr;
+		dst6->flow_label = src6->flow_label;
+	}
+#endif
+
+	dst->inet_num = src->inet_num;
+	dst->inet_dport = src->inet_dport;
+	dst->inet_sport = src->inet_sport;
+	dst->inet_daddr = src->inet_daddr;
+	dst->inet_saddr = src->inet_saddr;
+	dst->inet_rcv_saddr = src->inet_rcv_saddr;
+}
+
+/* proto->accept hook.
+ *
+ * Dequeues a connection from the listener subflow with inet_csk_accept().
+ * If the accepted sock is an MPTCP subflow, a new msk is cloned from the
+ * listening msk, initialised from the subflow context and returned in its
+ * place; otherwise the accepted TCP sock is returned as is.
+ *
+ * Returns NULL with *err set on failure.
+ */
+static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
+ bool kern)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *listener;
+ struct sock *newsk;
+
+ listener = __mptcp_nmpc_socket(msk);
+ if (WARN_ON_ONCE(!listener)) {
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
+ newsk = inet_csk_accept(listener->sk, flags, err, kern);
+ if (!newsk)
+ return NULL;
+
+ pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
+
+ if (sk_is_mptcp(newsk)) {
+ struct mptcp_subflow_context *subflow;
+ struct sock *new_mptcp_sock;
+ struct sock *ssk = newsk;
+ u64 ack_seq;
+
+ subflow = mptcp_subflow_ctx(newsk);
+ lock_sock(sk);
+
+ local_bh_disable();
+ new_mptcp_sock = sk_clone_lock(sk, GFP_ATOMIC);
+ if (!new_mptcp_sock) {
+ *err = -ENOBUFS;
+ local_bh_enable();
+ release_sock(sk);
+ /* mptcp_subflow_shutdown() takes the sk_shutdown-style
+ * mask (see the how++ in mptcp_shutdown()), hence
+ * SHUT_RDWR + 1
+ */
+ mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1);
+ tcp_close(newsk, 0);
+ return NULL;
+ }
+
+ __mptcp_init_sock(new_mptcp_sock);
+
+ /* from here on 'msk' refers to the new socket, not the
+ * listener
+ */
+ msk = mptcp_sk(new_mptcp_sock);
+ msk->local_key = subflow->local_key;
+ msk->token = subflow->token;
+ msk->subflow = NULL;
+ msk->first = newsk;
+
+ mptcp_token_update_accept(newsk, new_mptcp_sock);
+
+ msk->write_seq = subflow->idsn + 1;
+ if (subflow->can_ack) {
+ msk->can_ack = true;
+ msk->remote_key = subflow->remote_key;
+ /* initial ack_seq is the value derived from the remote
+ * key, plus one
+ */
+ mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
+ ack_seq++;
+ msk->ack_seq = ack_seq;
+ }
+ newsk = new_mptcp_sock;
+ mptcp_copy_inaddrs(newsk, ssk);
+ list_add(&subflow->node, &msk->conn_list);
+
+ /* will be fully established at mptcp_stream_accept()
+ * completion.
+ */
+ inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV);
+ bh_unlock_sock(new_mptcp_sock);
+ local_bh_enable();
+ release_sock(sk);
+
+ /* the subflow can already receive packet, avoid racing with
+ * the receive path and process the pending ones
+ */
+ lock_sock(ssk);
+ subflow->rel_write_seq = 1;
+ subflow->tcp_sock = ssk;
+ subflow->conn = new_mptcp_sock;
+ if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue)))
+ mptcp_subflow_data_available(ssk);
+ release_sock(ssk);
+ }
+
+ return newsk;
+}
+
+/* proto->destroy hook: intentionally empty; also called from
+ * mptcp_v6_destroy() when IPv6 support is built in.
+ */
+static void mptcp_destroy(struct sock *sk)
+{
+}
+
+/* proto->setsockopt hook: forward the option to the socket returned by
+ * __mptcp_socket_create().
+ *
+ * Returns the kernel_setsockopt() result, or -EOPNOTSUPP when no such
+ * socket can be obtained.
+ *
+ * @@ the meaning of setsockopt() when the socket is connected and
+ * there are multiple subflows is not defined.
+ */
+static int mptcp_setsockopt(struct sock *sk, int level, int optname,
+			    char __user *uoptval, unsigned int optlen)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct socket *ssock;
+	char __kernel *koptval;
+	int rc;
+
+	/* will be treated as __user in tcp_setsockopt */
+	koptval = (char __kernel __force *)uoptval;
+
+	pr_debug("msk=%p", msk);
+
+	lock_sock(sk);
+	ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
+	if (IS_ERR(ssock)) {
+		rc = -EOPNOTSUPP;
+	} else {
+		pr_debug("subflow=%p", ssock->sk);
+		rc = kernel_setsockopt(ssock, level, optname, koptval,
+				       optlen);
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+/* proto->getsockopt hook: forward the query to the socket returned by
+ * __mptcp_socket_create().
+ *
+ * Returns the kernel_getsockopt() result, or -EOPNOTSUPP when no such
+ * socket can be obtained.
+ *
+ * @@ the meaning of getsockopt() when the socket is connected and
+ * there are multiple subflows is not defined.
+ */
+static int mptcp_getsockopt(struct sock *sk, int level, int optname,
+			    char __user *uoptval, int __user *uoption)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct socket *ssock;
+	char __kernel *koptval;
+	int __kernel *koptlen;
+	int rc;
+
+	/* will be treated as __user in tcp_getsockopt */
+	koptval = (char __kernel __force *)uoptval;
+	koptlen = (int __kernel __force *)uoption;
+
+	pr_debug("msk=%p", msk);
+
+	lock_sock(sk);
+	ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
+	if (IS_ERR(ssock)) {
+		rc = -EOPNOTSUPP;
+	} else {
+		pr_debug("subflow=%p", ssock->sk);
+		rc = kernel_getsockopt(ssock, level, optname, koptval,
+				       koptlen);
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+/* proto->get_port hook: delegate the port lookup to the sock of the socket
+ * returned by __mptcp_nmpc_socket(); warns once and returns -EINVAL when
+ * there is none.
+ */
+static int mptcp_get_port(struct sock *sk, unsigned short snum)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct socket *ssock = __mptcp_nmpc_socket(msk);
+
+	pr_debug("msk=%p, subflow=%p", msk, ssock);
+	if (WARN_ON_ONCE(!ssock))
+		return -EINVAL;
+
+	return inet_csk_get_port(ssock->sk, snum);
+}
+
+/* Complete an outgoing MPTCP connection on subflow @ssk.
+ *
+ * Does nothing unless the subflow negotiated MP_CAPABLE. Otherwise the
+ * subflow mapping/sequence state is initialised and the keys, token and
+ * initial sequence numbers are copied from the subflow context to the
+ * parent msk.
+ */
+void mptcp_finish_connect(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+ u64 ack_seq;
+
+ subflow = mptcp_subflow_ctx(ssk);
+
+ if (!subflow->mp_capable)
+ return;
+
+ sk = subflow->conn;
+ msk = mptcp_sk(sk);
+
+ pr_debug("msk=%p, token=%u", sk, subflow->token);
+
+ /* initial ack_seq is the value derived from the remote key, plus one */
+ mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
+ ack_seq++;
+ subflow->map_seq = ack_seq;
+ subflow->map_subflow_seq = 1;
+ subflow->rel_write_seq = 1;
+
+ /* the socket is not connected yet, no msk/subflow ops can access/race
+ * accessing the field below
+ */
+ WRITE_ONCE(msk->remote_key, subflow->remote_key);
+ WRITE_ONCE(msk->local_key, subflow->local_key);
+ WRITE_ONCE(msk->token, subflow->token);
+ WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
+ WRITE_ONCE(msk->ack_seq, ack_seq);
+ WRITE_ONCE(msk->can_ack, 1);
+}
+
+/* Attach sock @sk to the struct socket @parent: under sk_callback_lock, point
+ * sk_wq at the parent's wait queue, set the socket back-pointer and copy the
+ * owner uid from the parent's inode.
+ */
+static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ rcu_assign_pointer(sk->sk_wq, &parent->wq);
+ sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/* proto->stream_memory_free hook: with @wake set, report whether
+ * MPTCP_SEND_SPACE is flagged on the msk; otherwise always report free
+ * memory.
+ */
+static bool mptcp_memory_free(const struct sock *sk, int wake)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (!wake)
+		return true;
+
+	return test_bit(MPTCP_SEND_SPACE, &msk->flags);
+}
+
+/* Protocol descriptor for IPv4 MPTCP sockets; also the template copied into
+ * mptcp_v6_prot by mptcp_proto_v6_init(). Hashing uses the plain inet
+ * helpers and shutdown is served directly by tcp_shutdown().
+ */
+static struct proto mptcp_prot = {
+ .name = "MPTCP",
+ .owner = THIS_MODULE,
+ .init = mptcp_init_sock,
+ .close = mptcp_close,
+ .accept = mptcp_accept,
+ .setsockopt = mptcp_setsockopt,
+ .getsockopt = mptcp_getsockopt,
+ .shutdown = tcp_shutdown,
+ .destroy = mptcp_destroy,
+ .sendmsg = mptcp_sendmsg,
+ .recvmsg = mptcp_recvmsg,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .get_port = mptcp_get_port,
+ .stream_memory_free = mptcp_memory_free,
+ .obj_size = sizeof(struct mptcp_sock),
+ .no_autobind = true,
+};
+
+/* bind() handler: bind the socket returned by __mptcp_socket_create() and, on
+ * success, mirror the resulting addresses onto the MPTCP socket.
+ *
+ * Returns 0 or a negative error, either from the socket lookup or from the
+ * underlying bind operation.
+ */
+static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct socket *ssock;
+	int rc;
+
+	lock_sock(sk);
+	ssock = __mptcp_socket_create(mptcp_sk(sk), MPTCP_SAME_STATE);
+	if (IS_ERR(ssock)) {
+		rc = PTR_ERR(ssock);
+	} else {
+		rc = ssock->ops->bind(ssock, uaddr, addr_len);
+		if (!rc)
+			mptcp_copy_inaddrs(sk, ssock->sk);
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+/* connect() handler: connect the socket returned by __mptcp_socket_create(),
+ * then mirror its state and addresses onto the MPTCP socket.
+ *
+ * Returns the result of the underlying connect operation, or the error from
+ * the socket lookup.
+ */
+static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct socket *ssock;
+ int err;
+
+ lock_sock(sock->sk);
+ ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
+ goto unlock;
+ }
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
+ mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
+#endif
+
+ err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
+ /* state and addresses are mirrored whatever the connect result is */
+ inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
+ mptcp_copy_inaddrs(sock->sk, ssock->sk);
+
+unlock:
+ release_sock(sock->sk);
+ return err;
+}
+
+/* getname() handler for IPv4 MPTCP sockets. Besides serving the request with
+ * inet_getname(), it switches @sock to the plain TCP ops when the underlying
+ * sock turns out to be a TCP one.
+ */
+static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ if (sock->sk->sk_prot == &tcp_prot) {
+ /* we are being invoked from __sys_accept4, after
+ * mptcp_accept() has just accepted a non-mp-capable
+ * flow: sk is a tcp_sk, not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ sock->ops = &inet_stream_ops;
+ }
+
+ return inet_getname(sock, uaddr, peer);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+/* getname() handler for IPv6 MPTCP sockets. Besides serving the request with
+ * inet6_getname(), it switches @sock to the plain TCPv6 ops when the
+ * underlying sock turns out to be a TCPv6 one.
+ */
+static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ if (sock->sk->sk_prot == &tcpv6_prot) {
+ /* we are being invoked from __sys_accept4 after
+ * mptcp_accept() has accepted a non-mp-capable
+ * subflow: sk is a tcp_sk, not mptcp.
+ *
+ * Hand the socket over to tcp so all further
+ * socket ops bypass mptcp.
+ */
+ sock->ops = &inet6_stream_ops;
+ }
+
+ return inet6_getname(sock, uaddr, peer);
+}
+#endif
+
+/* listen() handler: put the socket returned by __mptcp_socket_create() in
+ * listening mode and mirror its state onto the MPTCP socket; the addresses
+ * are mirrored too when the underlying listen succeeds.
+ *
+ * Returns 0 or a negative error.
+ */
+static int mptcp_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct socket *ssock;
+	int rc;
+
+	pr_debug("msk=%p", msk);
+
+	lock_sock(sk);
+	ssock = __mptcp_socket_create(msk, TCP_LISTEN);
+	if (IS_ERR(ssock)) {
+		rc = PTR_ERR(ssock);
+	} else {
+		rc = ssock->ops->listen(ssock, backlog);
+		/* the state is propagated even when listen() failed */
+		inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+		if (!rc)
+			mptcp_copy_inaddrs(sk, ssock->sk);
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+/* Tell whether @p is the plain TCP protocol descriptor (or the TCPv6 one,
+ * when MPTCP IPv6 support is built in).
+ */
+static bool is_tcp_proto(const struct proto *p)
+{
+	if (p == &tcp_prot)
+		return true;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	if (p == &tcpv6_prot)
+		return true;
+#endif
+	return false;
+}
+
+/* accept() handler.
+ *
+ * Requires the MPTCP socket to be in TCP_LISTEN state with a socket available
+ * from __mptcp_nmpc_socket(), else -EINVAL. The accept itself goes through
+ * that socket's ops. When the accepted sock is not a plain TCP one, its
+ * subflows lacking a struct socket are grafted onto @newsock and the new
+ * MPTCP socket moves to TCP_ESTABLISHED.
+ */
+static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
+ int flags, bool kern)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct socket *ssock;
+ int err;
+
+ pr_debug("msk=%p", msk);
+
+ lock_sock(sock->sk);
+ if (sock->sk->sk_state != TCP_LISTEN)
+ goto unlock_fail;
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock)
+ goto unlock_fail;
+
+ /* keep the listener sock alive while the msk lock is not held */
+ sock_hold(ssock->sk);
+ release_sock(sock->sk);
+
+ err = ssock->ops->accept(sock, newsock, flags, kern);
+ if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) {
+ /* NOTE: this 'msk' is the accepted socket and shadows the
+ * listener 'msk' declared at function scope
+ */
+ struct mptcp_sock *msk = mptcp_sk(newsock->sk);
+ struct mptcp_subflow_context *subflow;
+
+ /* set ssk->sk_socket of accept()ed flows to mptcp socket.
+ * This is needed so NOSPACE flag can be set from tcp stack.
+ */
+ list_for_each_entry(subflow, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!ssk->sk_socket)
+ mptcp_sock_graft(ssk, newsock);
+ }
+
+ inet_sk_state_store(newsock->sk, TCP_ESTABLISHED);
+ }
+
+ sock_put(ssock->sk);
+ return err;
+
+unlock_fail:
+ release_sock(sock->sk);
+ return -EINVAL;
+}
+
+/* poll() handler.
+ *
+ * While __mptcp_nmpc_socket() returns a socket, polling is delegated to it.
+ * Otherwise the caller is registered on the MPTCP socket wait queue and the
+ * mask is built from the MPTCP_DATA_READY / MPTCP_SEND_SPACE flags and the
+ * receive shutdown state, unless a TCP fallback socket is found, in which
+ * case that socket is polled instead.
+ */
+static __poll_t mptcp_poll(struct file *file, struct socket *sock,
+ struct poll_table_struct *wait)
+{
+ struct sock *sk = sock->sk;
+ struct mptcp_sock *msk;
+ struct socket *ssock;
+ __poll_t mask = 0;
+
+ msk = mptcp_sk(sk);
+ lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (ssock) {
+ mask = ssock->ops->poll(file, ssock, wait);
+ release_sock(sk);
+ return mask;
+ }
+
+ release_sock(sk);
+ sock_poll_wait(file, sock, wait);
+ lock_sock(sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ /* NOTE(review): no release_sock(sk) on this return path; presumably
+ * __mptcp_tcp_fallback() drops the msk lock when it returns a socket -
+ * confirm against its definition.
+ */
+ if (unlikely(ssock))
+ return ssock->ops->poll(file, ssock, NULL);
+
+ if (test_bit(MPTCP_DATA_READY, &msk->flags))
+ mask = EPOLLIN | EPOLLRDNORM;
+ if (sk_stream_is_writeable(sk) &&
+ test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+
+ release_sock(sk);
+
+ return mask;
+}
+
+/* shutdown() handler: apply @how to every subflow of the MPTCP socket.
+ *
+ * Returns 0, or -EINVAL when @how is not a valid shutdown request.
+ */
+static int mptcp_shutdown(struct socket *sock, int how)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct mptcp_subflow_context *subflow;
+ int ret = 0;
+
+ pr_debug("sk=%p, how=%d", msk, how);
+
+ lock_sock(sock->sk);
+
+ /* note: this state change happens before 'how' is validated below */
+ if (how == SHUT_WR || how == SHUT_RDWR)
+ inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
+
+ /* turn the SHUT_* value into the shutdown mask checked below and
+ * OR-ed into sk_shutdown by mptcp_subflow_shutdown()
+ */
+ how++;
+
+ if ((how & ~SHUTDOWN_MASK) || !how) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (sock->state == SS_CONNECTING) {
+ if ((1 << sock->sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_subflow_shutdown(tcp_sk, how);
+ }
+
+out_unlock:
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+/* Socket-level operations for IPv4 MPTCP sockets. bind, connect, accept,
+ * getname, poll, listen and shutdown are MPTCP-specific; the remaining
+ * entries use the generic inet/sock helpers.
+ */
+static const struct proto_ops mptcp_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = mptcp_bind,
+ .connect = mptcp_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = mptcp_stream_accept,
+ .getname = mptcp_v4_getname,
+ .poll = mptcp_poll,
+ .ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = mptcp_listen,
+ .shutdown = mptcp_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/* Binds (SOCK_STREAM, IPPROTO_MPTCP) to mptcp_prot and mptcp_stream_ops;
+ * registered by mptcp_proto_init().
+ */
+static struct inet_protosw mptcp_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_MPTCP,
+ .prot = &mptcp_prot,
+ .ops = &mptcp_stream_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+/* Register the IPv4 MPTCP protocol: share the TCP hash info, initialise the
+ * subflow layer, then register the proto and its protosw entry. A proto
+ * registration failure is fatal.
+ */
+void mptcp_proto_init(void)
+{
+	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
+
+	mptcp_subflow_init();
+
+	if (proto_register(&mptcp_prot, 1))
+		panic("Failed to register MPTCP proto.\n");
+
+	inet_register_protosw(&mptcp_protosw);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+/* Socket-level operations for IPv6 MPTCP sockets. Same MPTCP-specific
+ * handlers as the IPv4 table, with the inet6 variants for release, getname,
+ * ioctl, sendmsg and recvmsg.
+ */
+static const struct proto_ops mptcp_v6_stream_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = mptcp_bind,
+ .connect = mptcp_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = mptcp_stream_accept,
+ .getname = mptcp_v6_getname,
+ .poll = mptcp_poll,
+ .ioctl = inet6_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = mptcp_listen,
+ .shutdown = mptcp_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet6_sendmsg,
+ .recvmsg = inet6_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/* IPv6 protocol descriptor; filled in at run time by mptcp_proto_v6_init()
+ * as a modified copy of mptcp_prot.
+ */
+static struct proto mptcp_v6_prot;
+
+/* proto->destroy hook for IPv6 MPTCP sockets: run the common MPTCP destroy
+ * step, then inet6_destroy_sock().
+ */
+static void mptcp_v6_destroy(struct sock *sk)
+{
+ mptcp_destroy(sk);
+ inet6_destroy_sock(sk);
+}
+
+/* Binds (SOCK_STREAM, IPPROTO_MPTCP) to mptcp_v6_prot and
+ * mptcp_v6_stream_ops; registered by mptcp_proto_v6_init().
+ */
+static struct inet_protosw mptcp_v6_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_MPTCP,
+ .prot = &mptcp_v6_prot,
+ .ops = &mptcp_v6_stream_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+/* Register the IPv6 MPTCP protocol.
+ *
+ * mptcp_v6_prot starts as a copy of mptcp_prot, with its own name, destroy
+ * hook and an object size that also accounts for struct ipv6_pinfo.
+ *
+ * Returns 0 on success or a negative error; the proto is unregistered again
+ * if the protosw registration fails.
+ */
+int mptcp_proto_v6_init(void)
+{
+ int err;
+
+ mptcp_v6_prot = mptcp_prot;
+ strcpy(mptcp_v6_prot.name, "MPTCPv6");
+ /* presumably so the copy does not reuse the slab cache created for
+ * mptcp_prot - confirm against proto_register()
+ */
+ mptcp_v6_prot.slab = NULL;
+ mptcp_v6_prot.destroy = mptcp_v6_destroy;
+ mptcp_v6_prot.obj_size = sizeof(struct mptcp_sock) +
+ sizeof(struct ipv6_pinfo);
+
+ err = proto_register(&mptcp_v6_prot, 1);
+ if (err)
+ return err;
+
+ err = inet6_register_protosw(&mptcp_v6_protosw);
+ if (err)
+ proto_unregister(&mptcp_v6_prot);
+
+ return err;
+}
+#endif
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
new file mode 100644
index 000000000000..8a99a2930284
--- /dev/null
+++ b/net/mptcp/protocol.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#ifndef __MPTCP_PROTOCOL_H
+#define __MPTCP_PROTOCOL_H
+
+#include <linux/random.h>
+#include <net/tcp.h>
+#include <net/inet_connection_sock.h>
+
+/* presumably the protocol version carried in MP_CAPABLE - confirm against
+ * the option code
+ */
+#define MPTCP_SUPPORTED_VERSION 1
+
+/* MPTCP option bits */
+#define OPTION_MPTCP_MPC_SYN BIT(0)
+#define OPTION_MPTCP_MPC_SYNACK BIT(1)
+#define OPTION_MPTCP_MPC_ACK BIT(2)
+
+/* MPTCP option subtypes */
+#define MPTCPOPT_MP_CAPABLE 0
+#define MPTCPOPT_MP_JOIN 1
+#define MPTCPOPT_DSS 2
+#define MPTCPOPT_ADD_ADDR 3
+#define MPTCPOPT_RM_ADDR 4
+#define MPTCPOPT_MP_PRIO 5
+#define MPTCPOPT_MP_FAIL 6
+#define MPTCPOPT_MP_FASTCLOSE 7
+
+/* MPTCP suboption lengths */
+#define TCPOLEN_MPTCP_MPC_SYN 4
+#define TCPOLEN_MPTCP_MPC_SYNACK 12
+#define TCPOLEN_MPTCP_MPC_ACK 20
+#define TCPOLEN_MPTCP_MPC_ACK_DATA 22
+#define TCPOLEN_MPTCP_DSS_BASE 4
+#define TCPOLEN_MPTCP_DSS_ACK32 4
+#define TCPOLEN_MPTCP_DSS_ACK64 8
+#define TCPOLEN_MPTCP_DSS_MAP32 10
+#define TCPOLEN_MPTCP_DSS_MAP64 14
+#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
+
+/* MPTCP MP_CAPABLE flags */
+#define MPTCP_VERSION_MASK (0x0F)
+#define MPTCP_CAP_CHECKSUM_REQD BIT(7)
+#define MPTCP_CAP_EXTENSIBILITY BIT(6)
+#define MPTCP_CAP_HMAC_SHA256 BIT(0)
+/* NOTE(review): 0x3F covers bits 0-5 only, i.e. neither BIT(6) nor BIT(7)
+ * defined just above - confirm how this mask is meant to be used
+ */
+#define MPTCP_CAP_FLAG_MASK (0x3F)
+
+/* MPTCP DSS flags */
+#define MPTCP_DSS_DATA_FIN BIT(4)
+#define MPTCP_DSS_DSN64 BIT(3)
+#define MPTCP_DSS_HAS_MAP BIT(2)
+#define MPTCP_DSS_ACK64 BIT(1)
+#define MPTCP_DSS_HAS_ACK BIT(0)
+/* bits 0-4: exactly the five DSS flags above */
+#define MPTCP_DSS_FLAG_MASK (0x1F)
+
+/* MPTCP socket flags.
+ *
+ * These are bit NUMBERS within mptcp_sock::flags, not masks: they are only
+ * ever passed to set_bit()/clear_bit()/test_bit()/test_and_clear_bit() and
+ * __set_bit(), all of which take a bit index. Defining them with BIT()
+ * would silently select bits 1 and 2 instead of 0 and 1.
+ */
+#define MPTCP_DATA_READY	0
+#define MPTCP_SEND_SPACE	1
+
+/* MPTCP connection sock */
+struct mptcp_sock {
+ /* inet_connection_sock must be the first member */
+ struct inet_connection_sock sk;
+ /* keys, copied from the subflow context at accept/connect completion */
+ u64 local_key;
+ u64 remote_key;
+ /* data sequence number for the next mapping; advanced on transmit */
+ u64 write_seq;
+ /* advanced by the number of bytes read in mptcp_recvmsg() */
+ u64 ack_seq;
+ /* connection token; destroyed in __mptcp_close() */
+ u32 token;
+ /* MPTCP_DATA_READY / MPTCP_SEND_SPACE, accessed with bit operations */
+ unsigned long flags;
+ /* set together with remote_key/ack_seq */
+ bool can_ack;
+ /* list of mptcp_subflow_context, linked through their 'node' member */
+ struct list_head conn_list;
+ struct skb_ext *cached_ext; /* for the next sendmsg */
+ struct socket *subflow; /* outgoing connect/listener/!mp_capable */
+ /* NULL at init; set to the accepted subflow sock in mptcp_accept() */
+ struct sock *first;
+};
+
+/* Iterate over the subflow contexts on @__msk's conn_list. This expands to
+ * the plain list_for_each_entry(), so entries must not be removed while
+ * iterating.
+ */
+#define mptcp_for_each_subflow(__msk, __subflow) \
+ list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+
+/* Cast a struct sock to its enclosing mptcp_sock. The const qualifier of the
+ * argument is dropped by the cast.
+ */
+static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
+{
+ return (struct mptcp_sock *)sk;
+}
+
+/* MPTCP subflow request sock: a tcp_request_sock, which must stay the first
+ * member for the cast in mptcp_subflow_rsk() to be valid, extended with the
+ * MPTCP handshake state.
+ */
+struct mptcp_subflow_request_sock {
+ struct tcp_request_sock sk;
+ u16 mp_capable : 1,
+ mp_join : 1,
+ backup : 1,
+ remote_key_valid : 1;
+ u64 local_key;
+ u64 remote_key;
+ u64 idsn;
+ /* released via mptcp_token_destroy_request() when mp_capable is set */
+ u32 token;
+ u32 ssn_offset;
+};
+
+/* Cast a request_sock to the MPTCP subflow request sock embedding it. */
+static inline struct mptcp_subflow_request_sock *
+mptcp_subflow_rsk(const struct request_sock *rsk)
+{
+ return (struct mptcp_subflow_request_sock *)rsk;
+}
+
+/* MPTCP subflow context */
+struct mptcp_subflow_context {
+ struct list_head node;/* conn_list of subflows */
+ u64 local_key;
+ u64 remote_key;
+ /* the msk write_seq is initialised to idsn + 1 */
+ u64 idsn;
+ /* current mapping: data sequence number ... */
+ u64 map_seq;
+ u32 snd_isn;
+ u32 token;
+ /* subflow-relative write sequence; starts at 1, advanced on transmit */
+ u32 rel_write_seq;
+ /* ... subflow-relative sequence number ... */
+ u32 map_subflow_seq;
+ u32 ssn_offset;
+ /* ... and length of the current mapping */
+ u32 map_data_len;
+ u32 request_mptcp : 1, /* send MP_CAPABLE */
+ mp_capable : 1, /* remote is MPTCP capable */
+ fourth_ack : 1, /* send initial DSS */
+ conn_finished : 1,
+ map_valid : 1,
+ mpc_map : 1,
+ data_avail : 1,
+ rx_eof : 1,
+ can_ack : 1; /* only after processing the remote a key */
+
+ struct sock *tcp_sock; /* tcp sk backpointer */
+ struct sock *conn; /* parent mptcp_sock */
+ /* af_ops and callbacks put back on the TCP sock by
+ * mptcp_subflow_tcp_fallback()
+ */
+ const struct inet_connection_sock_af_ops *icsk_af_ops;
+ void (*tcp_data_ready)(struct sock *sk);
+ void (*tcp_state_change)(struct sock *sk);
+ void (*tcp_write_space)(struct sock *sk);
+
+ struct rcu_head rcu;
+};
+
+/* Return the MPTCP subflow context stored in the icsk_ulp_data of TCP sock
+ * @sk.
+ */
+static inline struct mptcp_subflow_context *
+mptcp_subflow_ctx(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* Use RCU on icsk_ulp_data only for sock diag code */
+ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
+}
+
+/* Return the TCP sock that @subflow belongs to (its tcp_sock backpointer). */
+static inline struct sock *
+mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
+{
+ return subflow->tcp_sock;
+}
+
+/* Offset of the subflow's copied_seq within the current mapping: copied_seq
+ * minus the subflow sequence offset and the mapping's subflow sequence
+ * number.
+ *
+ * NOTE(review): ssn_offset and map_subflow_seq are u32; if copied_seq is u32
+ * as well, the subtraction wraps at 32 bits before being widened to the u64
+ * return type - confirm this is intended.
+ */
+static inline u64
+mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
+{
+ return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq -
+ subflow->ssn_offset -
+ subflow->map_subflow_seq;
+}
+
+/* Data sequence number matching the subflow's current read position: the
+ * mapping's data sequence number plus the offset into the mapping.
+ */
+static inline u64
+mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
+{
+ return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
+}
+
+int mptcp_is_enabled(struct net *net);
+bool mptcp_subflow_data_available(struct sock *sk);
+void mptcp_subflow_init(void);
+int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
+
+/* Install on TCP sock @sk the data_ready/state_change/write_space callbacks
+ * and the af_ops stored in subflow context @ctx.
+ */
+static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *ctx)
+{
+ sk->sk_data_ready = ctx->tcp_data_ready;
+ sk->sk_state_change = ctx->tcp_state_change;
+ sk->sk_write_space = ctx->tcp_write_space;
+
+ inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
+}
+
+extern const struct inet_connection_sock_af_ops ipv4_specific;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+extern const struct inet_connection_sock_af_ops ipv6_specific;
+#endif
+
+void mptcp_proto_init(void);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+int mptcp_proto_v6_init(void);
+#endif
+
+/* Argument handed to mptcp_read_actor() through read_descriptor_t::arg.data.
+ * A NULL @msg makes the actor consume the payload without copying it.
+ */
+struct mptcp_read_arg {
+ struct msghdr *msg;
+};
+
+int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len);
+
+void mptcp_get_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx);
+
+void mptcp_finish_connect(struct sock *sk);
+
+int mptcp_token_new_request(struct request_sock *req);
+void mptcp_token_destroy_request(u32 token);
+int mptcp_token_new_connect(struct sock *sk);
+int mptcp_token_new_accept(u32 token);
+void mptcp_token_update_accept(struct sock *sk, struct sock *conn);
+void mptcp_token_destroy(u32 token);
+
+void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
+/* Generate a random 64-bit key in *@key and pass it, together with @token and
+ * @idsn, to mptcp_crypto_key_sha().
+ */
+static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
+{
+ /* we might consider a faster version that computes the key as a
+ * hash of some information available in the MPTCP socket. Use
+ * random data at the moment, as it's probably the safest option
+ * in case multiple sockets are opened in different namespaces at
+ * the same time.
+ */
+ get_random_bytes(key, sizeof(u64));
+ mptcp_crypto_key_sha(*key, token, idsn);
+}
+
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+ void *hash_out);
+
+/* Return the SKB_EXT_MPTCP extension of @skb, as found by skb_ext_find(). */
+static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
+{
+ return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
+}
+
+/* Wrap-safe comparison of 64-bit sequence numbers: true when @seq1 precedes
+ * @seq2, i.e. when their difference is negative once read as signed.
+ */
+static inline bool before64(__u64 seq1, __u64 seq2)
+{
+	__s64 delta = (__s64)(seq1 - seq2);
+
+	return delta < 0;
+}
+
+/* after64(a, b) is before64(b, a) */
+#define after64(seq2, seq1)	before64(seq1, seq2)
+
+#endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
new file mode 100644
index 000000000000..1662e1178949
--- /dev/null
+++ b/net/mptcp/subflow.c
@@ -0,0 +1,860 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/ip6_route.h>
+#endif
+#include <net/mptcp.h>
+#include "protocol.h"
+
+/* rebuild_header hook: when MPTCP was requested and no token has been
+ * assigned yet, create key/token/idsn first, then hand over to the original
+ * af_ops rebuild_header. A token creation error is returned as-is.
+ */
+static int subflow_rebuild_header(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	if (subflow->request_mptcp && !subflow->token) {
+		int err;
+
+		pr_debug("subflow=%p", sk);
+		err = mptcp_token_new_connect(sk);
+		if (err)
+			return err;
+	}
+
+	return subflow->icsk_af_ops->rebuild_header(sk);
+}
+
+/* Request sock destructor: release the token reserved for an MP_CAPABLE
+ * request, then chain to the plain TCP request sock destructor.
+ */
+static void subflow_req_destructor(struct request_sock *req)
+{
+	struct mptcp_subflow_request_sock *sreq = mptcp_subflow_rsk(req);
+
+	pr_debug("subflow_req=%p", sreq);
+
+	if (sreq->mp_capable) {
+		/* drops the entry inserted by mptcp_token_new_request() */
+		mptcp_token_destroy_request(sreq->token);
+	}
+
+	tcp_request_sock_ops.destructor(req);
+}
+
+/* Common (v4/v6) MPTCP part of request sock initialisation.
+ *
+ * Reads the MPTCP options carried by the SYN in @skb and, when both the peer
+ * and the listener ask for MPTCP, reserves a token for the request and
+ * records the SYN sequence number as subflow sequence offset.
+ */
+static void subflow_init_req(struct request_sock *req,
+			     const struct sock *sk_listener,
+			     struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+	struct mptcp_subflow_request_sock *sreq = mptcp_subflow_rsk(req);
+	struct tcp_options_received rx_opt;
+
+	pr_debug("subflow_req=%p, listener=%p", sreq, listener);
+
+	/* only the mptcp member of rx_opt is used below */
+	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
+	mptcp_get_options(skb, &rx_opt);
+
+	sreq->mp_capable = 0;
+	sreq->remote_key_valid = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+	 * TCP option space.
+	 */
+	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+		return;
+#endif
+
+	if (!rx_opt.mptcp.mp_capable || !listener->request_mptcp)
+		return;
+
+	/* the request is MP_CAPABLE only if a token could be reserved */
+	if (mptcp_token_new_request(req) == 0)
+		sreq->mp_capable = 1;
+
+	sreq->ssn_offset = TCP_SKB_CB(skb)->seq;
+}
+
+/* IPv4 init_req hook installed in subflow_request_sock_ipv4_ops: flag the
+ * request as MPTCP, run the plain TCP IPv4 init_req, then do the MPTCP
+ * specific initialisation in subflow_init_req().
+ */
+static void subflow_v4_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ tcp_rsk(req)->is_mptcp = 1;
+
+ tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
+
+ subflow_init_req(req, sk_listener, skb);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+/* IPv6 counterpart of subflow_v4_init_req(): flag the request as MPTCP, run
+ * the plain TCP IPv6 init_req, then do the MPTCP specific initialisation in
+ * subflow_init_req().
+ */
+static void subflow_v6_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ tcp_rsk(req)->is_mptcp = 1;
+
+ tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
+
+ subflow_init_req(req, sk_listener, skb);
+}
+#endif
+
+/* sk_rx_dst_set hook: always chains to the original af_ops handler, then,
+ * the first time it runs on a subflow that has an MPTCP parent, completes
+ * the MPTCP level connect and records the sequence number of @skb (when one
+ * is provided) as subflow sequence offset.
+ */
+static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
+
+	/* nothing more to do without a parent or when already finished */
+	if (!subflow->conn || subflow->conn_finished)
+		return;
+
+	pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
+		 subflow->remote_key);
+	mptcp_finish_connect(sk);
+	subflow->conn_finished = 1;
+
+	if (!skb)
+		return;
+
+	pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
+	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+}
+
+static struct request_sock_ops subflow_request_sock_ops;
+static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
+
+/* conn_request hook for IPv4 MPTCP listeners: same as plain TCP, but the
+ * request is created with the subflow request sock operations.
+ *
+ * Returns the tcp_conn_request() result, or 0 after accounting a listen
+ * drop for SYNs addressed to broadcast/multicast.
+ */
+static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+
+	pr_debug("subflow=%p", ctx);
+
+	/* Never answer to SYNs sent to broadcast or multicast */
+	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+		tcp_listendrop(sk);
+		return 0;
+	}
+
+	return tcp_conn_request(&subflow_request_sock_ops,
+				&subflow_request_sock_ipv4_ops,
+				sk, skb);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
+static struct inet_connection_sock_af_ops subflow_v6_specific;
+static struct inet_connection_sock_af_ops subflow_v6m_specific;
+
+/* conn_request hook for IPv6 MPTCP listeners. IPv4 packets reaching the
+ * socket are handed to the IPv4 handler; SYNs without a unicast destination
+ * are accounted as listen drops.
+ *
+ * Returns the tcp_conn_request() result, or 0 on drop.
+ */
+static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+
+	pr_debug("subflow=%p", ctx);
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return subflow_v4_conn_request(sk, skb);
+
+	if (!ipv6_unicast_destination(skb)) {
+		tcp_listendrop(sk);
+		return 0; /* don't send reset */
+	}
+
+	return tcp_conn_request(&subflow_request_sock_ops,
+				&subflow_request_sock_ipv6_ops, sk, skb);
+}
+#endif
+
+/* syn_recv_sock hook for MPTCP listeners.
+ *
+ * For an MP_CAPABLE request, tries to fetch the peer key from the MP_CAPABLE
+ * option carried by @skb (only when @skb is the segment immediately
+ * following the SYN) and clears mp_capable on the request if the option is
+ * missing. Child creation is delegated to the listener's original af_ops.
+ * When the child is created, owned by this request and MP_CAPABLE, its token
+ * is reserved in the token tree; if that fails the child is reset and closed.
+ *
+ * Returns the child socket as produced by the original hook, or NULL when
+ * the child had to be closed here.
+ */
+static struct sock *subflow_syn_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_request_sock *subflow_req;
+ struct tcp_options_received opt_rx;
+ struct sock *child;
+
+ pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+
+ /* if the sk is MP_CAPABLE, we try to fetch the client key */
+ subflow_req = mptcp_subflow_rsk(req);
+ if (subflow_req->mp_capable) {
+ if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
+ /* here we can receive and accept an in-window,
+ * out-of-order pkt, which will not carry the MP_CAPABLE
+ * opt even on mptcp enabled paths
+ */
+ goto create_child;
+ }
+
+ /* only mp_capable is pre-cleared; sndr_key is read below only
+ * when mptcp_get_options() reports the option as present
+ */
+ opt_rx.mptcp.mp_capable = 0;
+ mptcp_get_options(skb, &opt_rx);
+ if (opt_rx.mptcp.mp_capable) {
+ subflow_req->remote_key = opt_rx.mptcp.sndr_key;
+ subflow_req->remote_key_valid = 1;
+ } else {
+ subflow_req->mp_capable = 0;
+ }
+ }
+
+create_child:
+ child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ req_unhash, own_req);
+
+ if (child && *own_req) {
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
+
+ /* we have null ctx on TCP fallback, not fatal on MPC
+ * handshake
+ */
+ if (!ctx)
+ return child;
+
+ if (ctx->mp_capable) {
+ /* reserve the token with a placeholder entry; see
+ * mptcp_token_update_accept() for the replacement
+ */
+ if (mptcp_token_new_accept(ctx->token))
+ goto close_child;
+ }
+ }
+
+ return child;
+
+close_child:
+ pr_debug("closing child socket");
+ tcp_send_active_reset(child, GFP_ATOMIC);
+ inet_csk_prepare_forced_close(child);
+ tcp_done(child);
+ return NULL;
+}
+
+static struct inet_connection_sock_af_ops subflow_specific;
+
+/* Result of get_mapping_status() for the skb at the head of the subflow
+ * receive queue.
+ */
+enum mapping_status {
+ /* a valid mapping is in place for the head skb */
+ MAPPING_OK,
+ /* protocol violation; subflow_check_data_avail() treats it as fatal */
+ MAPPING_INVALID,
+ /* nothing to process: queue empty or zero-length skb consumed */
+ MAPPING_EMPTY,
+ /* DATA_FIN without payload and without a mapping currently in use */
+ MAPPING_DATA_FIN
+};
+
+/* Expand the 32-bit data sequence number carried in the low half of @seq to
+ * 64 bits, using the previous mapping (@old_seq, @old_data_len) as
+ * reference for the upper 32 bits.
+ */
+static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
+{
+	u64 upper;
+
+	/* same low 32 bits as the previous map: reuse the previous value */
+	if ((u32)old_seq == (u32)seq)
+		return old_seq;
+
+	/* Assume map covers data not mapped yet. */
+	upper = (old_seq + old_data_len + 1) & GENMASK_ULL(63, 32);
+	return seq | upper;
+}
+
+/* Emit a one-time warning describing the subflow sequence number @ssn that
+ * failed validation against the current mapping of @subflow.
+ *
+ * All printed values are unsigned sequence-space quantities, so they use
+ * %u: with %d, values above INT_MAX would be reported as negative numbers.
+ */
+static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
+{
+	WARN_ONCE(1, "Bad mapping: ssn=%u map_seq=%u map_data_len=%u",
+		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
+}
+
+/* Tell whether the not yet consumed part of @skb fits inside what is left
+ * of the current mapping. An skb that appears to be already fully consumed
+ * triggers a one-time warning and is reported as fully mapped.
+ */
+static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	unsigned int consumed, skb_left;
+
+	consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
+	if (WARN_ON_ONCE(consumed >= skb->len))
+		return true;
+
+	skb_left = skb->len - consumed;
+	return skb_left <= subflow->map_data_len -
+			   mptcp_subflow_get_map_offset(subflow);
+}
+
+/* Check that the current read position of @ssk, expressed as subflow
+ * sequence number, falls inside the current mapping.
+ *
+ * Returns true when it does; otherwise warns once and returns false.
+ */
+static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
+	u32 map_start = subflow->map_subflow_seq;
+	u32 map_end = subflow->map_subflow_seq + subflow->map_data_len;
+
+	if (unlikely(before(ssn, map_start))) {
+		/* Mapping covers data later in the subflow stream,
+		 * currently unsupported.
+		 */
+		warn_bad_map(subflow, ssn);
+		return false;
+	}
+
+	if (unlikely(!before(ssn, map_end))) {
+		/* Mapping covers only past subflow data, invalid */
+		warn_bad_map(subflow, ssn + skb->len);
+		return false;
+	}
+
+	return true;
+}
+
+/* Inspect the skb at the head of the @ssk receive queue and bring the
+ * subflow mapping state up to date with the DSS mapping it may carry.
+ *
+ * Returns:
+ *   MAPPING_EMPTY    - the queue is empty, or the head was a zero-length skb
+ *                      without mapping, which has been consumed here
+ *   MAPPING_DATA_FIN - DATA_FIN without payload while no mapping is in use
+ *   MAPPING_INVALID  - missing/unsupported/inconsistent mapping
+ *   MAPPING_OK       - a mapping is in place for the head skb
+ */
+static enum mapping_status get_mapping_status(struct sock *ssk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+	struct mptcp_ext *mpext;
+	struct sk_buff *skb;
+	u16 data_len;
+	u64 map_seq;
+
+	skb = skb_peek(&ssk->sk_receive_queue);
+	if (!skb)
+		return MAPPING_EMPTY;
+
+	mpext = mptcp_get_ext(skb);
+	if (!mpext || !mpext->use_map) {
+		if (!subflow->map_valid && !skb->len) {
+			/* the TCP stack deliver 0 len FIN pkt to the receive
+			 * queue, that is the only 0len pkts ever expected here,
+			 * and we can admit no mapping only for 0 len pkts
+			 */
+			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+				WARN_ONCE(1, "0len seq %u:%u flags %x",
+					  TCP_SKB_CB(skb)->seq,
+					  TCP_SKB_CB(skb)->end_seq,
+					  TCP_SKB_CB(skb)->tcp_flags);
+			sk_eat_skb(ssk, skb);
+			return MAPPING_EMPTY;
+		}
+
+		/* payload without any mapping to cover it */
+		if (!subflow->map_valid)
+			return MAPPING_INVALID;
+
+		goto validate_seq;
+	}
+
+	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
+		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
+		 mpext->data_len, mpext->data_fin);
+
+	data_len = mpext->data_len;
+	if (data_len == 0) {
+		pr_err("Infinite mapping not handled");
+		return MAPPING_INVALID;
+	}
+
+	if (mpext->data_fin == 1) {
+		if (data_len == 1) {
+			pr_debug("DATA_FIN with no payload");
+			if (!subflow->map_valid)
+				return MAPPING_DATA_FIN;
+
+			/* A DATA_FIN might arrive in a DSS
+			 * option before the previous mapping
+			 * has been fully consumed. Continue
+			 * handling the existing mapping.
+			 */
+			skb_ext_del(skb, SKB_EXT_MPTCP);
+			return MAPPING_OK;
+		}
+
+		/* Adjust for DATA_FIN using 1 byte of sequence space */
+		data_len--;
+	}
+
+	if (!mpext->dsn64) {
+		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
+				     mpext->data_seq);
+		/* report the freshly expanded value, not the previous map */
+		pr_debug("expanded seq=%llu", map_seq);
+	} else {
+		map_seq = mpext->data_seq;
+	}
+
+	if (subflow->map_valid) {
+		/* Allow replacing only with an identical map */
+		if (subflow->map_seq == map_seq &&
+		    subflow->map_subflow_seq == mpext->subflow_seq &&
+		    subflow->map_data_len == data_len) {
+			skb_ext_del(skb, SKB_EXT_MPTCP);
+			return MAPPING_OK;
+		}
+
+		/* If this skb data are fully covered by the current mapping,
+		 * the new map would need caching, which is not supported
+		 */
+		if (skb_is_fully_mapped(ssk, skb))
+			return MAPPING_INVALID;
+
+		/* will validate the next map after consuming the current one */
+		return MAPPING_OK;
+	}
+
+	/* no mapping in use: install the one carried by this skb */
+	subflow->map_seq = map_seq;
+	subflow->map_subflow_seq = mpext->subflow_seq;
+	subflow->map_data_len = data_len;
+	subflow->map_valid = 1;
+	subflow->mpc_map = mpext->mpc_map;
+	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
+		 subflow->map_seq, subflow->map_subflow_seq,
+		 subflow->map_data_len);
+
+validate_seq:
+	/* we revalidate valid mapping on new skb, because we must ensure
+	 * the current skb is completely covered by the available mapping
+	 */
+	if (!validate_mapping(ssk, skb))
+		return MAPPING_INVALID;
+
+	skb_ext_del(skb, SKB_EXT_MPTCP);
+	return MAPPING_OK;
+}
+
+/* Check whether @ssk holds data that is in sequence at the MPTCP level.
+ *
+ * Each loop iteration refreshes the mapping for the head skb, seeds the msk
+ * remote key and ack_seq from this subflow if the msk cannot ack yet, and
+ * compares the DSN mapped at the current read position with msk->ack_seq.
+ * On mismatch the mapped data is discarded (bounded by what is left of the
+ * current map) and the check is repeated.
+ *
+ * Returns true when in-sequence data is available (or was already flagged
+ * as such), false otherwise. On a fatal protocol error sk_err is set, the
+ * error is reported, and the subflow is moved to TCP_CLOSE and reset.
+ */
+static bool subflow_check_data_avail(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ enum mapping_status status;
+ struct mptcp_sock *msk;
+ struct sk_buff *skb;
+
+ pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
+ subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (subflow->data_avail)
+ return true;
+
+ if (!subflow->conn)
+ return false;
+
+ msk = mptcp_sk(subflow->conn);
+ for (;;) {
+ u32 map_remaining;
+ size_t delta;
+ u64 ack_seq;
+ u64 old_ack;
+
+ status = get_mapping_status(ssk);
+ pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
+ if (status == MAPPING_INVALID) {
+ ssk->sk_err = EBADMSG;
+ goto fatal;
+ }
+
+ /* MAPPING_EMPTY and MAPPING_DATA_FIN: no data to hand over */
+ if (status != MAPPING_OK)
+ return false;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (WARN_ON_ONCE(!skb))
+ return false;
+
+ /* if msk lacks the remote key, this subflow must provide an
+ * MP_CAPABLE-based mapping
+ */
+ if (unlikely(!READ_ONCE(msk->can_ack))) {
+ if (!subflow->mpc_map) {
+ ssk->sk_err = EBADMSG;
+ goto fatal;
+ }
+ WRITE_ONCE(msk->remote_key, subflow->remote_key);
+ WRITE_ONCE(msk->ack_seq, subflow->map_seq);
+ WRITE_ONCE(msk->can_ack, true);
+ }
+
+ old_ack = READ_ONCE(msk->ack_seq);
+ ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+ ack_seq);
+ if (ack_seq == old_ack)
+ break;
+
+ /* only accept in-sequence mapping. Old values are spurious
+ * retransmission; we can hit "future" values on active backup
+ * subflow switch, we rely on retransmissions to get
+ * in-sequence data.
+ * Concurrent subflows support will require subflow data
+ * reordering
+ */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+ if (before64(ack_seq, old_ack))
+ delta = min_t(size_t, old_ack - ack_seq, map_remaining);
+ else
+ delta = min_t(size_t, ack_seq - old_ack, map_remaining);
+
+ /* discard mapped data */
+ pr_debug("discarding %zu bytes, current map len=%d", delta,
+ map_remaining);
+ if (delta) {
+ /* NULL msg: the data is read only to be dropped */
+ struct mptcp_read_arg arg = {
+ .msg = NULL,
+ };
+ read_descriptor_t desc = {
+ .count = delta,
+ .arg.data = &arg,
+ };
+ int ret;
+
+ ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
+ if (ret < 0) {
+ ssk->sk_err = -ret;
+ goto fatal;
+ }
+ /* ret is non-negative here; short read: try again later */
+ if (ret < delta)
+ return false;
+ if (delta == map_remaining)
+ subflow->map_valid = 0;
+ }
+ }
+ return true;
+
+fatal:
+ /* fatal protocol error, close the socket */
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+ ssk->sk_error_report(ssk);
+ tcp_set_state(ssk, TCP_CLOSE);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ return false;
+}
+
+/* Refresh and return the data_avail state of the subflow @sk.
+ *
+ * A fully consumed mapping is retired first; data is then reported as
+ * available only when subflow_check_data_avail() succeeds and the head skb
+ * of the receive queue still holds unread bytes.
+ */
+bool mptcp_subflow_data_available(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct sk_buff *head;
+
+	/* check if current mapping is still valid */
+	if (subflow->map_valid &&
+	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
+		subflow->map_valid = 0;
+		subflow->data_avail = 0;
+
+		pr_debug("Done with mapping: seq=%u data_len=%u",
+			 subflow->map_subflow_seq,
+			 subflow->map_data_len);
+	}
+
+	if (subflow_check_data_avail(sk)) {
+		head = skb_peek(&sk->sk_receive_queue);
+		subflow->data_avail = head &&
+			before(tcp_sk(sk)->copied_seq,
+			       TCP_SKB_CB(head)->end_seq);
+		return subflow->data_avail;
+	}
+
+	subflow->data_avail = 0;
+	return false;
+}
+
+/* sk_data_ready hook for subflows.
+ *
+ * MP_CAPABLE subflows with a parent wake the parent only when MPTCP level
+ * data is available. All other subflows run the saved TCP callback and then
+ * notify the parent, if there is one.
+ */
+static void subflow_data_ready(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct sock *parent = subflow->conn;
+
+	if (parent && subflow->mp_capable) {
+		if (mptcp_subflow_data_available(sk)) {
+			set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+			parent->sk_data_ready(parent);
+		}
+		return;
+	}
+
+	subflow->tcp_data_ready(sk);
+
+	if (parent)
+		parent->sk_data_ready(parent);
+}
+
+/* sk_write_space hook for subflows: run the stream write-space handling on
+ * the subflow, then, if the subflow is writeable and has a parent, flag
+ * MPTCP_SEND_SPACE on the parent and run the write-space handling there too.
+ */
+static void subflow_write_space(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ sk_stream_write_space(sk);
+ if (parent && sk_stream_is_writeable(sk)) {
+ set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
+ smp_mb__after_atomic();
+ /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
+ sk_stream_write_space(parent);
+ }
+}
+
+/* Pick the subflow af_ops table matching the socket family: the IPv6 table
+ * for AF_INET6 sockets when MPTCP IPv6 support is built in, the IPv4 table
+ * in every other case.
+ */
+static struct inet_connection_sock_af_ops *
+subflow_default_af_ops(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return &subflow_v6_specific;
+#endif
+ return &subflow_specific;
+}
+
+/* Switch the af_ops of subflow @sk between the v4-mapped table (@mapped
+ * true) and the default table for its family (@mapped false).
+ *
+ * When a switch is needed, the af_ops currently installed on the socket are
+ * saved in the subflow context before being replaced. Compiles to a no-op
+ * without MPTCP IPv6 support.
+ */
+void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock_af_ops *target;
+
+ target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
+
+ pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+ subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
+
+ /* already using the wanted table: nothing to do */
+ if (likely(icsk->icsk_af_ops == target))
+ return;
+
+ subflow->icsk_af_ops = icsk->icsk_af_ops;
+ icsk->icsk_af_ops = target;
+#endif
+}
+
+/* Create a kernel TCP socket to be used as subflow of the MPTCP socket @sk
+ * and attach the "mptcp" ULP to it.
+ *
+ * On success *@new_sock points to the new socket, the subflow context
+ * refers to @sk as its parent and a reference on @sk is held on behalf of
+ * the subflow (dropped in subflow_ulp_release()).
+ *
+ * Returns 0 on success or a negative error code; on failure *@new_sock is
+ * left untouched and no socket is leaked.
+ */
+int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
+{
+	struct mptcp_subflow_context *subflow;
+	struct net *net = sock_net(sk);
+	struct socket *sf;
+	int err;
+
+	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
+			       &sf);
+	if (err)
+		return err;
+
+	lock_sock(sf->sk);
+
+	/* kernel sockets do not by default acquire net ref, but TCP timer
+	 * needs it.
+	 */
+	sf->sk->sk_net_refcnt = 1;
+	get_net(net);
+	this_cpu_add(*net->core.sock_inuse, 1);
+	err = tcp_set_ulp(sf->sk, "mptcp");
+	release_sock(sf->sk);
+
+	if (err) {
+		/* the socket was never handed to the caller: release it
+		 * here, otherwise it would be leaked
+		 */
+		sock_release(sf);
+		return err;
+	}
+
+	subflow = mptcp_subflow_ctx(sf->sk);
+	pr_debug("subflow=%p", subflow);
+
+	*new_sock = sf;
+	sock_hold(sk);
+	subflow->conn = sk;
+
+	return 0;
+}
+
+/* Allocate a zeroed subflow context for @sk and publish it as the socket
+ * ULP data.
+ *
+ * Returns the new context, or NULL if the allocation fails (in which case
+ * the socket is left untouched).
+ */
+static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
+							gfp_t priority)
+{
+	struct mptcp_subflow_context *ctx = kzalloc(sizeof(*ctx), priority);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (!ctx)
+		return NULL;
+
+	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+	INIT_LIST_HEAD(&ctx->node);
+
+	pr_debug("subflow=%p", ctx);
+
+	ctx->tcp_sock = sk;
+
+	return ctx;
+}
+
+/* Wake up every interruptible sleeper waiting on the wait queue of @sk, if
+ * there is any; the wait queue is accessed under the RCU read lock.
+ */
+static void __subflow_state_change(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+/* A subflow is done when its receive side has been shut down or the socket
+ * has reached TCP_CLOSE.
+ */
+static bool subflow_is_done(const struct sock *sk)
+{
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return true;
+
+	return sk->sk_state == TCP_CLOSE;
+}
+
+/* sk_state_change hook for subflows: wake sleepers on the subflow, give the
+ * parent a chance to notice newly available data, and propagate a receive
+ * side shutdown to the parent the first time the subflow is found done.
+ */
+static void subflow_state_change(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = READ_ONCE(subflow->conn);
+
+ __subflow_state_change(sk);
+
+ /* as recvmsg() does not acquire the subflow socket for ssk selection
+ * a fin packet carrying a DSS can be unnoticed if we don't trigger
+ * the data available machinery here.
+ */
+ if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
+ set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+ parent->sk_data_ready(parent);
+ }
+
+ /* rx_eof makes sure the shutdown is propagated only once */
+ if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
+ !subflow->rx_eof && subflow_is_done(sk)) {
+ subflow->rx_eof = 1;
+ parent->sk_shutdown |= RCV_SHUTDOWN;
+ __subflow_state_change(parent);
+ }
+}
+
+/* ULP init hook: attach a subflow context to @sk, mark the socket as MPTCP
+ * and divert its af_ops and data_ready/write_space/state_change callbacks
+ * to the subflow versions. The replaced af_ops and callbacks are saved in
+ * the context.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for sockets not created by the kernel,
+ * -ENOMEM if the context cannot be allocated.
+ */
+static int subflow_ulp_init(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct mptcp_subflow_context *ctx;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* disallow attaching ULP to a socket unless it has been
+	 * created with sock_create_kern()
+	 */
+	if (!sk->sk_kern_sock)
+		return -EOPNOTSUPP;
+
+	ctx = subflow_create_ctx(sk, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
+
+	tp->is_mptcp = 1;
+
+	/* save the original af_ops, then install the subflow ones */
+	ctx->icsk_af_ops = icsk->icsk_af_ops;
+	icsk->icsk_af_ops = subflow_default_af_ops(sk);
+
+	/* same for the socket callbacks */
+	ctx->tcp_data_ready = sk->sk_data_ready;
+	ctx->tcp_state_change = sk->sk_state_change;
+	ctx->tcp_write_space = sk->sk_write_space;
+	sk->sk_data_ready = subflow_data_ready;
+	sk->sk_write_space = subflow_write_space;
+	sk->sk_state_change = subflow_state_change;
+
+	return 0;
+}
+
+/* ULP release hook: drop the reference held on the parent MPTCP socket, if
+ * any, and free the subflow context after an RCU grace period.
+ */
+static void subflow_ulp_release(struct sock *sk)
+{
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+
+	if (ctx) {
+		if (ctx->conn)
+			sock_put(ctx->conn);
+
+		kfree_rcu(ctx, rcu);
+	}
+}
+
+/* Detach the MPTCP ULP from @sk: apply mptcp_subflow_tcp_fallback() using
+ * @old_ctx, then clear the ULP ops and data pointers and the is_mptcp flag.
+ * @old_ctx itself is not freed here.
+ */
+static void subflow_ulp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *old_ctx)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ mptcp_subflow_tcp_fallback(sk, old_ctx);
+ icsk->icsk_ulp_ops = NULL;
+ rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+ tcp_sk(sk)->is_mptcp = 0;
+}
+
+/* ULP clone hook, run on the child socket @newsk created from @req.
+ *
+ * At entry @newsk still refers to the listener context. MP_CAPABLE requests
+ * get a context of their own, filled from the request sock and the listener
+ * context; every other request, and any allocation failure, makes the child
+ * fall back to plain TCP.
+ */
+static void subflow_ulp_clone(const struct request_sock *req,
+			      struct sock *newsk,
+			      const gfp_t priority)
+{
+	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
+	struct mptcp_subflow_context *new_ctx;
+
+	if (!subflow_req->mp_capable)
+		goto fallback;
+
+	new_ctx = subflow_create_ctx(newsk, priority);
+	if (!new_ctx)
+		goto fallback;
+
+	/* see comments in subflow_syn_recv_sock(), MPTCP connection is fully
+	 * established only after we receive the remote key
+	 */
+	new_ctx->conn_finished = 1;
+
+	/* inherited from the listener context */
+	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
+	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
+	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
+	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
+
+	/* taken from the request sock */
+	new_ctx->mp_capable = 1;
+	new_ctx->fourth_ack = subflow_req->remote_key_valid;
+	new_ctx->can_ack = subflow_req->remote_key_valid;
+	new_ctx->remote_key = subflow_req->remote_key;
+	new_ctx->local_key = subflow_req->local_key;
+	new_ctx->token = subflow_req->token;
+	new_ctx->ssn_offset = subflow_req->ssn_offset;
+	new_ctx->idsn = subflow_req->idsn;
+	return;
+
+fallback:
+	subflow_ulp_fallback(newsk, old_ctx);
+}
+
+/* "mptcp" TCP ULP; registered by mptcp_subflow_init() and attached to
+ * subflow sockets by mptcp_subflow_create_socket().
+ */
+static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
+ .name = "mptcp",
+ .owner = THIS_MODULE,
+ .init = subflow_ulp_init,
+ .release = subflow_ulp_release,
+ .clone = subflow_ulp_clone,
+};
+
+/* Finish the setup of the subflow request sock operations: object size,
+ * dedicated slab cache and destructor.
+ *
+ * Returns 0 on success, -ENOMEM if the slab cache cannot be created.
+ */
+static int subflow_ops_init(struct request_sock_ops *subflow_ops)
+{
+	struct kmem_cache *cache;
+
+	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
+	subflow_ops->slab_name = "request_sock_subflow";
+
+	cache = kmem_cache_create(subflow_ops->slab_name,
+				  subflow_ops->obj_size, 0,
+				  SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
+				  NULL);
+	subflow_ops->slab = cache;
+	if (!cache)
+		return -ENOMEM;
+
+	subflow_ops->destructor = subflow_req_destructor;
+
+	return 0;
+}
+
+/* One-time setup of the subflow machinery: build the subflow request sock
+ * operations and the subflow af_ops tables as copies of the TCP ones with
+ * the MPTCP hooks plugged in, then register the "mptcp" ULP.
+ *
+ * Failures are not reported to the caller: both failure points panic.
+ */
+void mptcp_subflow_init(void)
+{
+ subflow_request_sock_ops = tcp_request_sock_ops;
+ if (subflow_ops_init(&subflow_request_sock_ops) != 0)
+ panic("MPTCP: failed to init subflow request sock ops\n");
+
+ subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
+ subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
+
+ subflow_specific = ipv4_specific;
+ subflow_specific.conn_request = subflow_v4_conn_request;
+ subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
+ subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_specific.rebuild_header = subflow_rebuild_header;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
+ subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
+
+ subflow_v6_specific = ipv6_specific;
+ subflow_v6_specific.conn_request = subflow_v6_conn_request;
+ subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
+ subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_v6_specific.rebuild_header = subflow_rebuild_header;
+
+ /* v4-mapped variant: the IPv6 subflow table with the transmit related
+ * members taken from the IPv4 table
+ */
+ subflow_v6m_specific = subflow_v6_specific;
+ subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
+ subflow_v6m_specific.send_check = ipv4_specific.send_check;
+ subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
+ subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
+ subflow_v6m_specific.net_frag_header_len = 0;
+#endif
+
+ if (tcp_register_ulp(&subflow_ulp_ops) != 0)
+ panic("MPTCP: failed to register subflows to ULP\n");
+}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
new file mode 100644
index 000000000000..84d887806090
--- /dev/null
+++ b/net/mptcp/token.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP token management
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ *
+ * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org,
+ * authored by:
+ *
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ * Gregory Detal <gregory.detal@uclouvain.be>
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
+ * Andreas Ripke <ripke@neclab.eu>
+ * Vlad Dogaru <vlad.dogaru@intel.com>
+ * Octavian Purdila <octavian.purdila@intel.com>
+ * John Ronan <jronan@tssg.org>
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
+ * Brandon Heller <brandonh@stanford.edu>
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/radix-tree.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/protocol.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+/* token_tree: token -> MPTCP socket, or &token_used while the token is only
+ * reserved (see mptcp_token_new_accept()).
+ * token_req_tree: tokens reserved by pending request socks; every entry
+ * points to &token_used.
+ * token_tree_lock serialises every access to both trees.
+ * token_used: dummy object whose address is the "reserved" placeholder.
+ */
+static RADIX_TREE(token_tree, GFP_ATOMIC);
+static RADIX_TREE(token_req_tree, GFP_ATOMIC);
+static DEFINE_SPINLOCK(token_tree_lock);
+static int token_used __read_mostly;
+
+/**
+ * mptcp_token_new_request - create new key/idsn/token for subflow_request
+ * @req: the request socket
+ *
+ * This function is called when a new mptcp connection is coming in.
+ *
+ * It creates a unique token to identify the new mptcp connection,
+ * a secret local key and the initial data sequence number (idsn).
+ * The token is recorded in the request token tree until
+ * mptcp_token_destroy_request() removes it.
+ *
+ * Returns 0 on success, or the radix_tree_insert() error code.
+ */
+int mptcp_token_new_request(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ int err;
+
+ /* generate keys until the derived token is unused in both trees */
+ while (1) {
+ u32 token;
+
+ mptcp_crypto_key_gen_sha(&subflow_req->local_key,
+ &subflow_req->token,
+ &subflow_req->idsn);
+ pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
+ req, subflow_req->local_key, subflow_req->token,
+ subflow_req->idsn);
+
+ token = subflow_req->token;
+ spin_lock_bh(&token_tree_lock);
+ /* leave the loop with the lock held, so that the lookup above
+ * and the insertion below cannot be interleaved with other users
+ */
+ if (!radix_tree_lookup(&token_req_tree, token) &&
+ !radix_tree_lookup(&token_tree, token))
+ break;
+ spin_unlock_bh(&token_tree_lock);
+ }
+
+ err = radix_tree_insert(&token_req_tree,
+ subflow_req->token, &token_used);
+ spin_unlock_bh(&token_tree_lock);
+ return err;
+}
+
+/**
+ * mptcp_token_new_connect - create new key/idsn/token for subflow
+ * @sk: the socket that will initiate a connection
+ *
+ * This function is called when a new outgoing mptcp connection is
+ * initiated.
+ *
+ * It creates a unique token to identify the new mptcp connection,
+ * a secret local key and the initial data sequence number (idsn).
+ *
+ * On success, the mptcp connection can be found again using
+ * the computed token at a later time, this is needed to process
+ * join requests.
+ *
+ * Returns 0 on success, or the radix_tree_insert() error code.
+ */
+int mptcp_token_new_connect(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *mptcp_sock = subflow->conn;
+ int err;
+
+ /* generate keys until the derived token is unused in both trees */
+ while (1) {
+ u32 token;
+
+ mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
+ &subflow->idsn);
+
+ pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
+ sk, subflow->local_key, subflow->token, subflow->idsn);
+
+ token = subflow->token;
+ spin_lock_bh(&token_tree_lock);
+ /* leave the loop with the lock held, so that the lookup above
+ * and the insertion below cannot be interleaved with other users
+ */
+ if (!radix_tree_lookup(&token_req_tree, token) &&
+ !radix_tree_lookup(&token_tree, token))
+ break;
+ spin_unlock_bh(&token_tree_lock);
+ }
+ /* the token maps to the parent MPTCP socket, not to the subflow */
+ err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock);
+ spin_unlock_bh(&token_tree_lock);
+
+ return err;
+}
+
+/**
+ * mptcp_token_new_accept - insert token for later processing
+ * @token: the token to insert to the tree
+ *
+ * Called when a SYN packet creates a new logical connection, i.e.
+ * is not a join request.
+ *
+ * We don't have an mptcp socket yet at that point, so the token is
+ * stored with a placeholder value.
+ * This is paired with mptcp_token_update_accept, called on accept().
+ *
+ * Returns 0 on success, or the radix_tree_insert() error code.
+ */
+int mptcp_token_new_accept(u32 token)
+{
+ int err;
+
+ spin_lock_bh(&token_tree_lock);
+ err = radix_tree_insert(&token_tree, token, &token_used);
+ spin_unlock_bh(&token_tree_lock);
+
+ return err;
+}
+
+/**
+ * mptcp_token_update_accept - update token to map to mptcp socket
+ * @sk: the initial subflow for this mptcp socket
+ * @conn: the new struct mptcp_sock
+ *
+ * Called when the first mptcp socket is created on accept to
+ * refresh the dummy mapping (done to reserve the token) with
+ * the mptcp_socket structure that wasn't allocated before.
+ *
+ * A missing token entry, or an entry that is not the placeholder,
+ * triggers a one-time warning.
+ */
+void mptcp_token_update_accept(struct sock *sk, struct sock *conn)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	void __rcu **slot;
+
+	spin_lock_bh(&token_tree_lock);
+	slot = radix_tree_lookup_slot(&token_tree, subflow->token);
+	if (!WARN_ON_ONCE(!slot)) {
+		/* the slot is expected to still hold the placeholder */
+		WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used);
+		radix_tree_replace_slot(&token_tree, slot, conn);
+	}
+	spin_unlock_bh(&token_tree_lock);
+}
+
+/**
+ * mptcp_token_destroy_request - remove mptcp connection/token
+ * @token: token of mptcp connection to remove
+ *
+ * Remove not-yet-fully-established incoming connection identified
+ * by @token from the request token tree.
+ */
+void mptcp_token_destroy_request(u32 token)
+{
+ spin_lock_bh(&token_tree_lock);
+ radix_tree_delete(&token_req_tree, token);
+ spin_unlock_bh(&token_tree_lock);
+}
+
+/**
+ * mptcp_token_destroy - remove mptcp connection/token
+ * @token: token of mptcp connection to remove
+ *
+ * Remove the connection identified by @token from the token tree.
+ */
+void mptcp_token_destroy(u32 token)
+{
+ spin_lock_bh(&token_tree_lock);
+ radix_tree_delete(&token_tree, token);
+ spin_unlock_bh(&token_tree_lock);
+}