Diffstat (limited to 'net/mptcp')
-rw-r--r--  net/mptcp/Kconfig    |   26
-rw-r--r--  net/mptcp/Makefile   |    4
-rw-r--r--  net/mptcp/crypto.c   |  152
-rw-r--r--  net/mptcp/ctrl.c     |  130
-rw-r--r--  net/mptcp/options.c  |  586
-rw-r--r--  net/mptcp/protocol.c | 1276
-rw-r--r--  net/mptcp/protocol.h |  240
-rw-r--r--  net/mptcp/subflow.c  |  860
-rw-r--r--  net/mptcp/token.c    |  195
9 files changed, 3469 insertions(+), 0 deletions(-)
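The crypto.c hunk below derives two values from each 64-bit MPTCP key: the connection token is the most significant 32 bits of SHA-256 over the big-endian key, and the initial data sequence number (IDSN) is the least significant 64 bits of the same digest (see mptcp_crypto_key_sha()). A minimal userspace sketch of that derivation follows; OpenSSL's one-shot SHA256() is used purely as a stand-in for the in-kernel SHA-256 library and is an assumption of this example, not part of the patch.

#include <stdint.h>
#include <stdio.h>
#include <openssl/sha.h>	/* assumption: OpenSSL, link with -lcrypto */

/* userspace mirror of mptcp_crypto_key_sha(): hash the 64-bit key in
 * network (big-endian) byte order, then split the digest
 */
static void mptcp_key_sha(uint64_t key, uint32_t *token, uint64_t *idsn)
{
	unsigned char in[8], digest[SHA256_DIGEST_LENGTH];
	int i;

	for (i = 0; i < 8; i++)
		in[i] = (unsigned char)(key >> (56 - 8 * i));

	SHA256(in, sizeof(in), digest);

	/* token: most significant 32 bits of the digest */
	*token = (uint32_t)digest[0] << 24 | (uint32_t)digest[1] << 16 |
		 (uint32_t)digest[2] << 8 | digest[3];

	/* IDSN: least significant 64 bits of the digest (bytes 24..31) */
	*idsn = 0;
	for (i = 24; i < 32; i++)
		*idsn = (*idsn << 8) | digest[i];
}

int main(void)
{
	uint32_t token;
	uint64_t idsn;

	mptcp_key_sha(0x0102030405060708ULL, &token, &idsn);
	printf("token=%u idsn=%llu\n", token, (unsigned long long)idsn);
	return 0;
}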
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig new file mode 100644 index 000000000000..5db56d2218c5 --- /dev/null +++ b/net/mptcp/Kconfig @@ -0,0 +1,26 @@ + +config MPTCP + bool "MPTCP: Multipath TCP" + depends on INET + select SKB_EXTENSIONS + select CRYPTO_LIB_SHA256 + help + Multipath TCP (MPTCP) connections send and receive data over multiple + subflows in order to utilize multiple network paths. Each subflow + uses the TCP protocol, and TCP options carry header information for + MPTCP. + +config MPTCP_IPV6 + bool "MPTCP: IPv6 support for Multipath TCP" + depends on MPTCP + select IPV6 + default y + +config MPTCP_HMAC_TEST + bool "Tests for MPTCP HMAC implementation" + default n + help + This option enables a boot-time self-test for the HMAC implementation + used by the MPTCP code. + + Say N if you are unsure. diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile new file mode 100644 index 000000000000..4e98d9edfd0a --- /dev/null +++ b/net/mptcp/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MPTCP) += mptcp.o + +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c new file mode 100644 index 000000000000..40d1bb18fd60 --- /dev/null +++ b/net/mptcp/crypto.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP cryptographic functions + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and + * mptcp_ipv6.c from multipath-tcp.org, authored by: + * + * Sébastien Barré <sebastien.barre@uclouvain.be> + * Christoph Paasch <christoph.paasch@uclouvain.be> + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> + * Gregory Detal <gregory.detal@uclouvain.be> + * Fabien Duchêne <fabien.duchene@uclouvain.be> + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> + * Lavkesh Lahngir <lavkesh51@gmail.com> + * Andreas Ripke <ripke@neclab.eu> + * Vlad Dogaru <vlad.dogaru@intel.com> + * Octavian Purdila <octavian.purdila@intel.com> + * John Ronan <jronan@tssg.org> + * Catalin Nicutar <catalin.nicutar@gmail.com> + * Brandon Heller <brandonh@stanford.edu> + */ + +#include <linux/kernel.h> +#include <crypto/sha.h> +#include <asm/unaligned.h> + +#include "protocol.h" + +#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4) + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) +{ + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be64 input = cpu_to_be64(key); + struct sha256_state state; + + sha256_init(&state); + sha256_update(&state, (__force u8 *)&input, sizeof(input)); + sha256_final(&state, (u8 *)mptcp_hashed_key); + + if (token) + *token = be32_to_cpu(mptcp_hashed_key[0]); + if (idsn) + *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hmac) +{ + u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be32 *hash_out = (__force __be32 *)hmac; + struct sha256_state state; + u8 key1be[8]; + u8 key2be[8]; + int i; + + put_unaligned_be64(key1, key1be); + put_unaligned_be64(key2, key2be); + + /* Generate key xored with ipad */ + memset(input, 0x36, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); + put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); + + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); + + /* emit sha256(K1 || msg) 
on the second input block, so we can + * reuse 'input' for the last hashing + */ + sha256_final(&state, &input[SHA256_BLOCK_SIZE]); + + /* Prepare second part of hmac */ + memset(input, 0x5C, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE); + sha256_final(&state, (u8 *)mptcp_hashed_key); + + /* take only the first 160 bits */ + for (i = 0; i < 5; i++) + hash_out[i] = mptcp_hashed_key[i]; +} + +#ifdef CONFIG_MPTCP_HMAC_TEST +struct test_case { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraints on the + * input and key size, and we truncate the output. + */ +static struct test_case tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da67", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6", + }, +}; + +static int __init test_mptcp_crypto(void) +{ + char hmac[20], hmac_hex[41]; + u32 nonce1, nonce2; + u64 key1, key2; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* the mptcp hmac code will convert to big-endian before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + for (j = 0; j < 20; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[40] = 0; + + if (memcmp(hmac_hex, tests[i].result, 40)) + pr_err("test %d failed, got %s expected %s", i, + hmac_hex, tests[i].result); + else + pr_info("test %d [ ok ]", i); + } + return 0; +} + +late_initcall(test_mptcp_crypto); +#endif diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c new file mode 100644 index 000000000000..8e39585d37f3 --- /dev/null +++ b/net/mptcp/ctrl.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2019, Tessares SA. + */ + +#include <linux/sysctl.h> + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include "protocol.h" + +#define MPTCP_SYSCTL_PATH "net/mptcp" + +static int mptcp_pernet_id; +struct mptcp_pernet { + struct ctl_table_header *ctl_table_hdr; + + int mptcp_enabled; +}; + +static struct mptcp_pernet *mptcp_get_pernet(struct net *net) +{ + return net_generic(net, mptcp_pernet_id); +} + +int mptcp_is_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->mptcp_enabled; +} + +static struct ctl_table mptcp_sysctl_table[] = { + { + .procname = "enabled", + .maxlen = sizeof(int), + .mode = 0644, + /* users with CAP_NET_ADMIN or root (either one, not necessarily + * both) can change this value, same as for other sysctls in the + * 'net' tree. 
+ */ + .proc_handler = proc_dointvec, + }, + {} +}; + +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; +} + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = mptcp_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); + if (!table) + goto err_alloc; + } + + table[0].data = &pernet->mptcp_enabled; + + hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); + if (!hdr) + goto err_reg; + + pernet->ctl_table_hdr = hdr; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) +{ + struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; + + unregister_net_sysctl_table(pernet->ctl_table_hdr); + + kfree(table); +} + +static int __net_init mptcp_net_init(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_set_defaults(pernet); + + return mptcp_pernet_new_table(net, pernet); +} + +/* Note: the callback will only be called per extra netns */ +static void __net_exit mptcp_net_exit(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_del_table(pernet); +} + +static struct pernet_operations mptcp_pernet_ops = { + .init = mptcp_net_init, + .exit = mptcp_net_exit, + .id = &mptcp_pernet_id, + .size = sizeof(struct mptcp_pernet), +}; + +void __init mptcp_init(void) +{ + mptcp_proto_init(); + + if (register_pernet_subsys(&mptcp_pernet_ops) < 0) + panic("Failed to register MPTCP pernet subsystem.\n"); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int __init mptcpv6_init(void) +{ + int err; + + err = mptcp_proto_v6_init(); + + return err; +} +#endif diff --git a/net/mptcp/options.c b/net/mptcp/options.c new file mode 100644 index 000000000000..45acd877bef3 --- /dev/null +++ b/net/mptcp/options.c @@ -0,0 +1,586 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. 
+ */ + +#include <linux/kernel.h> +#include <net/tcp.h> +#include <net/mptcp.h> +#include "protocol.h" + +static bool mptcp_cap_flag_sha256(u8 flags) +{ + return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; +} + +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx) +{ + struct mptcp_options_received *mp_opt = &opt_rx->mptcp; + u8 subtype = *ptr >> 4; + int expected_opsize; + u8 version; + u8 flags; + + switch (subtype) { + case MPTCPOPT_MP_CAPABLE: + /* strict size checking */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + if (skb->len > tcp_hdr(skb)->doff << 2) + expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + expected_opsize = TCPOLEN_MPTCP_MPC_ACK; + } else { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) + expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; + else + expected_opsize = TCPOLEN_MPTCP_MPC_SYN; + } + if (opsize != expected_opsize) + break; + + /* try to be gentle vs future versions on the initial syn */ + version = *ptr++ & MPTCP_VERSION_MASK; + if (opsize != TCPOLEN_MPTCP_MPC_SYN) { + if (version != MPTCP_SUPPORTED_VERSION) + break; + } else if (version < MPTCP_SUPPORTED_VERSION) { + break; + } + + flags = *ptr++; + if (!mptcp_cap_flag_sha256(flags) || + (flags & MPTCP_CAP_EXTENSIBILITY)) + break; + + /* RFC 6824, Section 3.1: + * "For the Checksum Required bit (labeled "A"), if either + * host requires the use of checksums, checksums MUST be used. + * In other words, the only way for checksums not to be used + * is if both hosts in their SYNs set A=0." + * + * Section 3.3.0: + * "If a checksum is not present when its use has been + * negotiated, the receiver MUST close the subflow with a RST as + * it is considered broken." + * + * We don't implement DSS checksum - fall back to TCP. + */ + if (flags & MPTCP_CAP_CHECKSUM_REQD) + break; + + mp_opt->mp_capable = 1; + if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { + mp_opt->sndr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + /* Section 3.1.: + * "the data parameters in a MP_CAPABLE are semantically + * equivalent to those in a DSS option and can be used + * interchangeably." 
+ */ + mp_opt->dss = 1; + mp_opt->use_map = 1; + mp_opt->mpc_map = 1; + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + version, flags, opsize, mp_opt->sndr_key, + mp_opt->rcvr_key, mp_opt->data_len); + break; + + case MPTCPOPT_DSS: + pr_debug("DSS"); + ptr++; + + /* we must clear 'mpc_map' to be able to detect MP_CAPABLE + * map vs DSS map in mptcp_incoming_options(), and reconstruct + * map info accordingly + */ + mp_opt->mpc_map = 0; + flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; + mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; + mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; + mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; + mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; + mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); + + pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", + mp_opt->data_fin, mp_opt->dsn64, + mp_opt->use_map, mp_opt->ack64, + mp_opt->use_ack); + + expected_opsize = TCPOLEN_MPTCP_DSS_BASE; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) + expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) + expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; + } + + /* RFC 6824, Section 3.3: + * If a checksum is present, but its use had + * not been negotiated in the MP_CAPABLE handshake, + * the checksum field MUST be ignored. + */ + if (opsize != expected_opsize && + opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) + break; + + mp_opt->dss = 1; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) { + mp_opt->data_ack = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_ack = get_unaligned_be32(ptr); + ptr += 4; + } + + pr_debug("data_ack=%llu", mp_opt->data_ack); + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) { + mp_opt->data_seq = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_seq = get_unaligned_be32(ptr); + ptr += 4; + } + + mp_opt->subflow_seq = get_unaligned_be32(ptr); + ptr += 4; + + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + mp_opt->data_seq, mp_opt->subflow_seq, + mp_opt->data_len); + } + + break; + + default: + break; + } +} + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); + int length = (th->doff * 4) - sizeof(struct tcphdr); + + ptr = (const unsigned char *)(th + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP) + mptcp_parse_option(skb, ptr, opsize, opt_rx); + ptr += opsize - 2; + length -= opsize; + } + } +} + +bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + /* we will use snd_isn to detect first pkt [re]transmission + * in mptcp_established_options_mp() + */ + subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; + if (subflow->request_mptcp) { + pr_debug("local_key=%llu", subflow->local_key); + opts->suboptions = OPTION_MPTCP_MPC_SYN; + 
opts->sndr_key = subflow->local_key; + *size = TCPOLEN_MPTCP_MPC_SYN; + return true; + } + return false; +} + +void mptcp_rcv_synsent(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct tcp_sock *tp = tcp_sk(sk); + + pr_debug("subflow=%p", subflow); + if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + subflow->mp_capable = 1; + subflow->can_ack = 1; + subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + } else { + tcp_sk(sk)->is_mptcp = 0; + } +} + +static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_ext *mpext; + unsigned int data_len; + + pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow, + subflow->fourth_ack, subflow->snd_isn, + skb ? TCP_SKB_CB(skb)->seq : 0, remaining); + + if (subflow->mp_capable && !subflow->fourth_ack && skb && + subflow->snd_isn == TCP_SKB_CB(skb)->seq) { + /* When skb is not available, we'd better over-estimate the + * emitted options len. A full DSS option is longer than + * TCPOLEN_MPTCP_MPC_ACK_DATA, so let the caller try to fit + * that. + */ + mpext = mptcp_get_ext(skb); + data_len = mpext ? mpext->data_len : 0; + + /* we will check ext_copy.data_len in mptcp_write_options() to + * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and + * TCPOLEN_MPTCP_MPC_ACK + */ + opts->ext_copy.data_len = data_len; + opts->suboptions = OPTION_MPTCP_MPC_ACK; + opts->sndr_key = subflow->local_key; + opts->rcvr_key = subflow->remote_key; + + /* Section 3.1. + * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK + * packets that start the first subflow of an MPTCP connection, + * as well as the first packet that carries data + */ + if (data_len > 0) + *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); + else + *size = TCPOLEN_MPTCP_MPC_ACK; + + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", + subflow, subflow->local_key, subflow->remote_key, + data_len); + + return true; + } + return false; +} + +static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, + struct mptcp_ext *ext) +{ + ext->data_fin = 1; + + if (!ext->use_map) { + /* RFC6824 requires a DSS mapping with specific values + * if DATA_FIN is set but no data payload is mapped + */ + ext->use_map = 1; + ext->dsn64 = 1; + ext->data_seq = mptcp_sk(subflow->conn)->write_seq; + ext->subflow_seq = 0; + ext->data_len = 1; + } else { + /* If there's an existing DSS mapping, DATA_FIN consumes + * 1 additional byte of mapping space. 
+ */ + ext->data_len++; + } +} + +static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + unsigned int dss_size = 0; + struct mptcp_ext *mpext; + struct mptcp_sock *msk; + unsigned int ack_size; + bool ret = false; + u8 tcp_fin; + + if (skb) { + mpext = mptcp_get_ext(skb); + tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; + } else { + mpext = NULL; + tcp_fin = 0; + } + + if (!skb || (mpext && mpext->use_map) || tcp_fin) { + unsigned int map_size; + + map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + + remaining -= map_size; + dss_size = map_size; + if (mpext) + opts->ext_copy = *mpext; + + if (skb && tcp_fin && + subflow->conn->sk_state != TCP_ESTABLISHED) + mptcp_write_data_fin(subflow, &opts->ext_copy); + ret = true; + } + + opts->ext_copy.use_ack = 0; + msk = mptcp_sk(subflow->conn); + if (!msk || !READ_ONCE(msk->can_ack)) { + *size = ALIGN(dss_size, 4); + return ret; + } + + ack_size = TCPOLEN_MPTCP_DSS_ACK64; + + /* Add kind/length/subtype/flag overhead if mapping is not populated */ + if (dss_size == 0) + ack_size += TCPOLEN_MPTCP_DSS_BASE; + + dss_size += ack_size; + + opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.ack64 = 1; + opts->ext_copy.use_ack = 1; + + *size = ALIGN(dss_size, 4); + return true; +} + +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts) +{ + unsigned int opt_size = 0; + bool ret = false; + + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) + ret = true; + else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, + opts)) + ret = true; + + /* we reserved enough space for the above options, and exceeding the + * TCP option space would be fatal + */ + if (WARN_ON_ONCE(opt_size > remaining)) + return false; + + *size += opt_size; + remaining -= opt_size; + + return ret; +} + +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + if (subflow_req->mp_capable) { + opts->suboptions = OPTION_MPTCP_MPC_SYNACK; + opts->sndr_key = subflow_req->local_key; + *size = TCPOLEN_MPTCP_MPC_SYNACK; + pr_debug("subflow_req=%p, local_key=%llu", + subflow_req, subflow_req->local_key); + return true; + } + return false; +} + +static bool check_fourth_ack(struct mptcp_subflow_context *subflow, + struct sk_buff *skb, + struct mptcp_options_received *mp_opt) +{ + /* here we can process OoO, in-window pkts, only in-sequence 4th ack + * are relevant + */ + if (likely(subflow->fourth_ack || + TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) + return true; + + if (mp_opt->use_ack) + subflow->fourth_ack = 1; + + if (subflow->can_ack) + return true; + + /* If the first established packet does not contain MP_CAPABLE + data + * then fallback to TCP + */ + if (!mp_opt->mp_capable) { + subflow->mp_capable = 0; + tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; + return false; + } + subflow->remote_key = mp_opt->sndr_key; + subflow->can_ack = 1; + return true; +} + +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_options_received *mp_opt; + struct mptcp_ext *mpext; + + mp_opt = 
&opt_rx->mptcp; + if (!check_fourth_ack(subflow, skb, mp_opt)) + return; + + if (!mp_opt->dss) + return; + + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + if (!mpext) + return; + + memset(mpext, 0, sizeof(*mpext)); + + if (mp_opt->use_map) { + if (mp_opt->mpc_map) { + /* this is an MP_CAPABLE carrying MPTCP data + * we know this map the first chunk of data + */ + mptcp_crypto_key_sha(subflow->remote_key, NULL, + &mpext->data_seq); + mpext->data_seq++; + mpext->subflow_seq = 1; + mpext->dsn64 = 1; + mpext->mpc_map = 1; + } else { + mpext->data_seq = mp_opt->data_seq; + mpext->subflow_seq = mp_opt->subflow_seq; + mpext->dsn64 = mp_opt->dsn64; + } + mpext->data_len = mp_opt->data_len; + mpext->use_map = 1; + } + + if (mp_opt->use_ack) { + mpext->data_ack = mp_opt->data_ack; + mpext->use_ack = 1; + mpext->ack64 = mp_opt->ack64; + } + + mpext->data_fin = mp_opt->data_fin; +} + +void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +{ + if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | + OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + u8 len; + + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYN; + else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYNACK; + else if (opts->ext_copy.data_len) + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + len = TCPOLEN_MPTCP_MPC_ACK; + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | + (MPTCPOPT_MP_CAPABLE << 12) | + (MPTCP_SUPPORTED_VERSION << 8) | + MPTCP_CAP_HMAC_SHA256); + + if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & + opts->suboptions)) + goto mp_capable_done; + + put_unaligned_be64(opts->sndr_key, ptr); + ptr += 2; + if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) + goto mp_capable_done; + + put_unaligned_be64(opts->rcvr_key, ptr); + ptr += 2; + if (!opts->ext_copy.data_len) + goto mp_capable_done; + + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + ptr += 1; + } + +mp_capable_done: + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { + struct mptcp_ext *mpext = &opts->ext_copy; + u8 len = TCPOLEN_MPTCP_DSS_BASE; + u8 flags = 0; + + if (mpext->use_ack) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; + } + + if (mpext->use_map) { + len += TCPOLEN_MPTCP_DSS_MAP64; + + /* Use only 64-bit mapping flags for now, add + * support for optional 32-bit mappings later. + */ + flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; + if (mpext->data_fin) + flags |= MPTCP_DSS_DATA_FIN; + } + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | + (len << 16) | + (MPTCPOPT_DSS << 12) | + (flags)); + + if (mpext->use_ack) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } + + if (mpext->use_map) { + put_unaligned_be64(mpext->data_seq, ptr); + ptr += 2; + put_unaligned_be32(mpext->subflow_seq, ptr); + ptr += 1; + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } + } +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c new file mode 100644 index 000000000000..39fdca79ce90 --- /dev/null +++ b/net/mptcp/protocol.c @@ -0,0 +1,1276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. 
+ */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/sched/signal.h> +#include <linux/atomic.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/protocol.h> +#include <net/tcp.h> +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include <net/transp_v6.h> +#endif +#include <net/mptcp.h> +#include "protocol.h" + +#define MPTCP_SAME_STATE TCP_MAX_STATES + +static void __mptcp_close(struct sock *sk, long timeout); + +static const struct proto_ops *tcp_proto_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return &inet6_stream_ops; +#endif + return &inet_stream_ops; +} + +/* MP_CAPABLE handshake failed, convert msk to plain tcp, replacing + * socket->sk and stream ops and destroying msk + * return the msk socket, as we can't access msk anymore after this function + * completes + * Called with msk lock held, releases such lock before returning + */ +static struct socket *__mptcp_fallback_to_tcp(struct mptcp_sock *msk, + struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct socket *sock; + struct sock *sk; + + sk = (struct sock *)msk; + sock = sk->sk_socket; + subflow = mptcp_subflow_ctx(ssk); + + /* detach the msk socket */ + list_del_init(&subflow->node); + sock_orphan(sk); + sock->sk = NULL; + + /* socket is now TCP */ + lock_sock(ssk); + sock_graft(ssk, sock); + if (subflow->conn) { + /* We can't release the ULP data on a live socket, + * restore the tcp callback + */ + mptcp_subflow_tcp_fallback(ssk, subflow); + sock_put(subflow->conn); + subflow->conn = NULL; + } + release_sock(ssk); + sock->ops = tcp_proto_ops(ssk); + + /* destroy the left-over msk sock */ + __mptcp_close(sk, 0); + return sock; +} + +/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not + * completed yet or has failed, return the subflow socket. + * Otherwise return NULL. + */ +static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +{ + if (!msk->subflow || READ_ONCE(msk->can_ack)) + return NULL; + + return msk->subflow; +} + +static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) +{ + return msk->first && !sk_is_mptcp(msk->first); +} + +/* if the mp_capable handshake has failed, it fallbacks msk to plain TCP, + * releases the socket lock and returns a reference to the now TCP socket. 
+ * Otherwise returns NULL + */ +static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); + + if (likely(!__mptcp_needs_tcp_fallback(msk))) + return NULL; + + if (msk->subflow) { + /* the first subflow is an active connection, discard the + * paired socket + */ + msk->subflow->sk = NULL; + sock_release(msk->subflow); + msk->subflow = NULL; + } + + return __mptcp_fallback_to_tcp(msk, msk->first); +} + +static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) +{ + return !msk->first; +} + +static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int err; + + ssock = __mptcp_nmpc_socket(msk); + if (ssock) + goto set_state; + + if (!__mptcp_can_create_subflow(msk)) + return ERR_PTR(-EINVAL); + + err = mptcp_subflow_create_socket(sk, &ssock); + if (err) + return ERR_PTR(err); + + msk->first = ssock->sk; + msk->subflow = ssock; + subflow = mptcp_subflow_ctx(ssock->sk); + list_add(&subflow->node, &msk->conn_list); + subflow->request_mptcp = 1; + +set_state: + if (state != MPTCP_SAME_STATE) + inet_sk_state_store(sk, state); + return ssock; +} + +static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + +static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) +{ + if (!msk->cached_ext) + msk->cached_ext = __skb_ext_alloc(); + + return !!msk->cached_ext; +} + +static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + sock_owned_by_me(sk); + + mptcp_for_each_subflow(msk, subflow) { + if (subflow->data_avail) + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + +static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, + const struct sk_buff *skb, + const struct mptcp_ext *mpext) +{ + if (!tcp_skb_can_collapse_to(skb)) + return false; + + /* can collapse only if MPTCP level sequence is in order */ + return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; +} + +static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, + struct msghdr *msg, long *timeo, int *pmss_now, + int *ps_goal) +{ + int mss_now, avail_size, size_goal, ret; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_ext *mpext = NULL; + struct sk_buff *skb, *tail; + bool can_collapse = false; + struct page_frag *pfrag; + size_t psize; + + /* use the mptcp page cache so that we can easily move the data + * from one substream to another, but do per subflow memory accounting + */ + pfrag = sk_page_frag(sk); + while (!sk_page_frag_refill(ssk, pfrag) || + !mptcp_ext_cache_refill(msk)) { + ret = sk_stream_wait_memory(ssk, timeo); + if (ret) + return ret; + if (unlikely(__mptcp_needs_tcp_fallback(msk))) + return 0; + } + + /* compute copy limit */ + mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); + *pmss_now = mss_now; + *ps_goal = size_goal; + avail_size = size_goal; + skb = tcp_write_queue_tail(ssk); + if (skb) { + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + + /* Limit the write to the size available in the + * current skb, if any, so that we create at most one new skb. 
+ * Explicitly tells TCP internals to avoid collapsing on later + * queue management operation, to avoid breaking the ext <-> + * SSN association set here + */ + can_collapse = (size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(msk, skb, mpext); + if (!can_collapse) + TCP_SKB_CB(skb)->eor = 1; + else + avail_size = size_goal - skb->len; + } + psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size); + + /* Copy to page */ + pr_debug("left=%zu", msg_data_left(msg)); + psize = copy_page_from_iter(pfrag->page, pfrag->offset, + min_t(size_t, msg_data_left(msg), psize), + &msg->msg_iter); + pr_debug("left=%zu", msg_data_left(msg)); + if (!psize) + return -EINVAL; + + /* tell the TCP stack to delay the push so that we can safely + * access the skb after the sendpages call + */ + ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, + msg->msg_flags | MSG_SENDPAGE_NOTLAST); + if (ret <= 0) + return ret; + if (unlikely(ret < psize)) + iov_iter_revert(&msg->msg_iter, psize - ret); + + /* if the tail skb extension is still the cached one, collapsing + * really happened. Note: we can't check for 'same skb' as the sk_buff + * hdr on tail can be transmitted, freed and re-allocated by the + * do_tcp_sendpages() call + */ + tail = tcp_write_queue_tail(ssk); + if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { + WARN_ON_ONCE(!can_collapse); + mpext->data_len += ret; + goto out; + } + + skb = tcp_write_queue_tail(ssk); + mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); + msk->cached_ext = NULL; + + memset(mpext, 0, sizeof(*mpext)); + mpext->data_seq = msk->write_seq; + mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; + mpext->data_len = ret; + mpext->use_map = 1; + mpext->dsn64 = 1; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", + mpext->data_seq, mpext->subflow_seq, mpext->data_len, + mpext->dsn64); + +out: + pfrag->offset += ret; + msk->write_seq += ret; + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + + return ret; +} + +static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) +{ + struct socket *sock; + + if (likely(sk_stream_is_writeable(ssk))) + return; + + sock = READ_ONCE(ssk->sk_socket); + + if (sock) { + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + /* set NOSPACE only after clearing SEND_SPACE flag */ + set_bit(SOCK_NOSPACE, &sock->flags); + } +} + +static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +{ + int mss_now = 0, size_goal = 0, ret = 0; + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + size_t copied = 0; + struct sock *ssk; + long timeo; + + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + return -EOPNOTSUPP; + + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) { +fallback: + pr_debug("fallback passthrough"); + ret = sock_sendmsg(ssock, msg); + return ret >= 0 ? ret + copied : (copied ? 
copied : ret); + } + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + ssk = mptcp_subflow_get(msk); + if (!ssk) { + release_sock(sk); + return -ENOTCONN; + } + + pr_debug("conn_list->subflow=%p", ssk); + + lock_sock(ssk); + while (msg_data_left(msg)) { + ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, + &size_goal); + if (ret < 0) + break; + if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { + release_sock(ssk); + ssock = __mptcp_tcp_fallback(msk); + goto fallback; + } + + copied += ret; + } + + if (copied) { + ret = copied; + tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, + size_goal); + } + + ssk_check_wmem(msk, ssk); + release_sock(ssk); + release_sock(sk); + return ret; +} + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct mptcp_read_arg *arg = desc->arg.data; + size_t copy_len; + + copy_len = min(desc->count, len); + + if (likely(arg->msg)) { + int err; + + err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len); + if (err) { + pr_debug("error path"); + desc->error = err; + return err; + } + } else { + pr_debug("Flushing skb payload"); + } + + desc->count -= copy_len; + + pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count); + return copy_len; +} + +static void mptcp_wait_data(struct sock *sk, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct mptcp_sock *msk = mptcp_sk(sk); + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + + sk_wait_event(sk, timeo, + test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait); + + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); +} + +static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; + bool more_data_avail = false; + struct mptcp_read_arg arg; + read_descriptor_t desc; + bool wait_data = false; + struct socket *ssock; + struct tcp_sock *tp; + bool done = false; + struct sock *ssk; + int copied = 0; + int target; + long timeo; + + if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) + return -EOPNOTSUPP; + + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) { +fallback: + pr_debug("fallback-read subflow=%p", + mptcp_subflow_ctx(ssock->sk)); + copied = sock_recvmsg(ssock, msg, flags); + return copied; + } + + arg.msg = msg; + desc.arg.data = &arg; + desc.error = 0; + + timeo = sock_rcvtimeo(sk, nonblock); + + len = min_t(size_t, len, INT_MAX); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + while (!done) { + u32 map_remaining; + int bytes_read; + + ssk = mptcp_subflow_recv_lookup(msk); + pr_debug("msk=%p ssk=%p", msk, ssk); + if (!ssk) + goto wait_for_data; + + subflow = mptcp_subflow_ctx(ssk); + tp = tcp_sk(ssk); + + lock_sock(ssk); + do { + /* try to read as much data as available */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + desc.count = min_t(size_t, len - copied, map_remaining); + pr_debug("reading %zu bytes, copied %d", desc.count, + copied); + bytes_read = tcp_read_sock(ssk, &desc, + mptcp_read_actor); + if (bytes_read < 0) { + if (!copied) + copied = bytes_read; + done = true; + goto next; + } + + pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq, + msk->ack_seq + bytes_read); + msk->ack_seq += bytes_read; + copied += bytes_read; + if (copied >= len) { + done = true; + goto next; + } + if (tp->urg_data && 
tp->urg_seq == tp->copied_seq) { + pr_err("Urgent data present, cannot proceed"); + done = true; + goto next; + } +next: + more_data_avail = mptcp_subflow_data_available(ssk); + } while (more_data_avail && !done); + release_sock(ssk); + continue; + +wait_for_data: + more_data_avail = false; + + /* only the master socket status is relevant here. The exit + * conditions mirror closely tcp_recvmsg() + */ + if (copied >= target) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + copied = -ENOTCONN; + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + pr_debug("block timeout %ld", timeo); + wait_data = true; + mptcp_wait_data(sk, &timeo); + if (unlikely(__mptcp_tcp_fallback(msk))) + goto fallback; + } + + if (more_data_avail) { + if (!test_bit(MPTCP_DATA_READY, &msk->flags)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } else if (!wait_data) { + clear_bit(MPTCP_DATA_READY, &msk->flags); + + /* .. race-breaker: ssk might get new data after last + * data_available() returns false. + */ + ssk = mptcp_subflow_recv_lookup(msk); + if (unlikely(ssk)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } + + release_sock(sk); + return copied; +} + +/* subflow sockets can be either outgoing (connect) or incoming + * (accept). + * + * Outgoing subflows use in-kernel sockets. + * Incoming subflows do not have their own 'struct socket' allocated, + * so we need to use tcp_close() after detaching them from the mptcp + * parent socket. + */ +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow, + long timeout) +{ + struct socket *sock = READ_ONCE(ssk->sk_socket); + + list_del(&subflow->node); + + if (sock && sock != sk->sk_socket) { + /* outgoing subflow */ + sock_release(sock); + } else { + /* incoming subflow */ + tcp_close(ssk, timeout); + } +} + +static int __mptcp_init_sock(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + INIT_LIST_HEAD(&msk->conn_list); + __set_bit(MPTCP_SEND_SPACE, &msk->flags); + + msk->first = NULL; + + return 0; +} + +static int mptcp_init_sock(struct sock *sk) +{ + if (!mptcp_is_enabled(sock_net(sk))) + return -ENOPROTOOPT; + + return __mptcp_init_sock(sk); +} + +static void mptcp_subflow_shutdown(struct sock *ssk, int how) +{ + lock_sock(ssk); + + switch (ssk->sk_state) { + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* fall through */ + case TCP_SYN_SENT: + tcp_disconnect(ssk, O_NONBLOCK); + break; + default: + ssk->sk_shutdown |= how; + tcp_shutdown(ssk, how); + break; + } + + /* Wake up anyone sleeping in poll. 
*/ + ssk->sk_state_change(ssk); + release_sock(ssk); +} + +/* Called with msk lock held, releases such lock before returning */ +static void __mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_token_destroy(msk->token); + inet_sk_state_store(sk, TCP_CLOSE); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + __mptcp_close_ssk(sk, ssk, subflow, timeout); + } + + if (msk->cached_ext) + __skb_ext_put(msk->cached_ext); + release_sock(sk); + sk_common_release(sk); +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __mptcp_close(sk, timeout); +} + +static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); + struct ipv6_pinfo *msk6 = inet6_sk(msk); + + msk->sk_v6_daddr = ssk->sk_v6_daddr; + msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; + + if (msk6 && ssk6) { + msk6->saddr = ssk6->saddr; + msk6->flow_label = ssk6->flow_label; + } +#endif + + inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; + inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; + inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; + inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; + inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; + inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; +} + +static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, + bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *listener; + struct sock *newsk; + + listener = __mptcp_nmpc_socket(msk); + if (WARN_ON_ONCE(!listener)) { + *err = -EINVAL; + return NULL; + } + + pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); + newsk = inet_csk_accept(listener->sk, flags, err, kern); + if (!newsk) + return NULL; + + pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); + + if (sk_is_mptcp(newsk)) { + struct mptcp_subflow_context *subflow; + struct sock *new_mptcp_sock; + struct sock *ssk = newsk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(newsk); + lock_sock(sk); + + local_bh_disable(); + new_mptcp_sock = sk_clone_lock(sk, GFP_ATOMIC); + if (!new_mptcp_sock) { + *err = -ENOBUFS; + local_bh_enable(); + release_sock(sk); + mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1); + tcp_close(newsk, 0); + return NULL; + } + + __mptcp_init_sock(new_mptcp_sock); + + msk = mptcp_sk(new_mptcp_sock); + msk->local_key = subflow->local_key; + msk->token = subflow->token; + msk->subflow = NULL; + msk->first = newsk; + + mptcp_token_update_accept(newsk, new_mptcp_sock); + + msk->write_seq = subflow->idsn + 1; + if (subflow->can_ack) { + msk->can_ack = true; + msk->remote_key = subflow->remote_key; + mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); + ack_seq++; + msk->ack_seq = ack_seq; + } + newsk = new_mptcp_sock; + mptcp_copy_inaddrs(newsk, ssk); + list_add(&subflow->node, &msk->conn_list); + + /* will be fully established at mptcp_stream_accept() + * completion. 
+ */ + inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV); + bh_unlock_sock(new_mptcp_sock); + local_bh_enable(); + release_sock(sk); + + /* the subflow can already receive packet, avoid racing with + * the receive path and process the pending ones + */ + lock_sock(ssk); + subflow->rel_write_seq = 1; + subflow->tcp_sock = ssk; + subflow->conn = new_mptcp_sock; + if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue))) + mptcp_subflow_data_available(ssk); + release_sock(ssk); + } + + return newsk; +} + +static void mptcp_destroy(struct sock *sk) +{ +} + +static int mptcp_setsockopt(struct sock *sk, int level, int optname, + char __user *uoptval, unsigned int optlen) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + char __kernel *optval; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + /* will be treated as __user in tcp_setsockopt */ + optval = (char __kernel __force *)uoptval; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not defined. + */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (!IS_ERR(ssock)) { + pr_debug("subflow=%p", ssock->sk); + ret = kernel_setsockopt(ssock, level, optname, optval, optlen); + } + release_sock(sk); + + return ret; +} + +static int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *uoptval, int __user *uoption) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + char __kernel *optval; + int ret = -EOPNOTSUPP; + int __kernel *option; + struct socket *ssock; + + /* will be treated as __user in tcp_getsockopt */ + optval = (char __kernel __force *)uoptval; + option = (int __kernel __force *)uoption; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of getsockopt() when the socket is connected and + * there are multiple subflows is not defined. 
+ */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (!IS_ERR(ssock)) { + pr_debug("subflow=%p", ssock->sk); + ret = kernel_getsockopt(ssock, level, optname, optval, option); + } + release_sock(sk); + + return ret; +} + +static int mptcp_get_port(struct sock *sk, unsigned short snum) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + + ssock = __mptcp_nmpc_socket(msk); + pr_debug("msk=%p, subflow=%p", msk, ssock); + if (WARN_ON_ONCE(!ssock)) + return -EINVAL; + + return inet_csk_get_port(ssock->sk, snum); +} + +void mptcp_finish_connect(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(ssk); + + if (!subflow->mp_capable) + return; + + sk = subflow->conn; + msk = mptcp_sk(sk); + + pr_debug("msk=%p, token=%u", sk, subflow->token); + + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); + ack_seq++; + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; + subflow->rel_write_seq = 1; + + /* the socket is not connected yet, no msk/subflow ops can access/race + * accessing the field below + */ + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->local_key, subflow->local_key); + WRITE_ONCE(msk->token, subflow->token); + WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->can_ack, 1); +} + +static void mptcp_sock_graft(struct sock *sk, struct socket *parent) +{ + write_lock_bh(&sk->sk_callback_lock); + rcu_assign_pointer(sk->sk_wq, &parent->wq); + sk_set_socket(sk, parent); + sk->sk_uid = SOCK_INODE(parent)->i_uid; + write_unlock_bh(&sk->sk_callback_lock); +} + +static bool mptcp_memory_free(const struct sock *sk, int wake) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; +} + +static struct proto mptcp_prot = { + .name = "MPTCP", + .owner = THIS_MODULE, + .init = mptcp_init_sock, + .close = mptcp_close, + .accept = mptcp_accept, + .setsockopt = mptcp_setsockopt, + .getsockopt = mptcp_getsockopt, + .shutdown = tcp_shutdown, + .destroy = mptcp_destroy, + .sendmsg = mptcp_sendmsg, + .recvmsg = mptcp_recvmsg, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = mptcp_get_port, + .stream_memory_free = mptcp_memory_free, + .obj_size = sizeof(struct mptcp_sock), + .no_autobind = true, +}; + +static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->bind(ssock, uaddr, addr_len); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. 
+ */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; +#endif + + err = ssock->ops->connect(ssock, uaddr, addr_len, flags); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcp_prot) { + /* we are being invoked from __sys_accept4, after + * mptcp_accept() has just accepted a non-mp-capable + * flow: sk is a tcp_sk, not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + sock->ops = &inet_stream_ops; + } + + return inet_getname(sock, uaddr, peer); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcpv6_prot) { + /* we are being invoked from __sys_accept4 after + * mptcp_accept() has accepted a non-mp-capable + * subflow: sk is a tcp_sk, not mptcp. + * + * Hand the socket over to tcp so all further + * socket ops bypass mptcp. + */ + sock->ops = &inet6_stream_ops; + } + + return inet6_getname(sock, uaddr, peer); +} +#endif + +static int mptcp_listen(struct socket *sock, int backlog) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_LISTEN); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->listen(ssock, backlog); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static bool is_tcp_proto(const struct proto *p) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + return p == &tcp_prot || p == &tcpv6_prot; +#else + return p == &tcp_prot; +#endif +} + +static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, + int flags, bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + if (sock->sk->sk_state != TCP_LISTEN) + goto unlock_fail; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + goto unlock_fail; + + sock_hold(ssock->sk); + release_sock(sock->sk); + + err = ssock->ops->accept(sock, newsock, flags, kern); + if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + struct mptcp_sock *msk = mptcp_sk(newsock->sk); + struct mptcp_subflow_context *subflow; + + /* set ssk->sk_socket of accept()ed flows to mptcp socket. + * This is needed so NOSPACE flag can be set from tcp stack. 
+ */ + list_for_each_entry(subflow, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!ssk->sk_socket) + mptcp_sock_graft(ssk, newsock); + } + + inet_sk_state_store(newsock->sk, TCP_ESTABLISHED); + } + + sock_put(ssock->sk); + return err; + +unlock_fail: + release_sock(sock->sk); + return -EINVAL; +} + +static __poll_t mptcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + struct sock *sk = sock->sk; + struct mptcp_sock *msk; + struct socket *ssock; + __poll_t mask = 0; + + msk = mptcp_sk(sk); + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (ssock) { + mask = ssock->ops->poll(file, ssock, wait); + release_sock(sk); + return mask; + } + + release_sock(sk); + sock_poll_wait(file, sock, wait); + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) + return ssock->ops->poll(file, ssock, NULL); + + if (test_bit(MPTCP_DATA_READY, &msk->flags)) + mask = EPOLLIN | EPOLLRDNORM; + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + + release_sock(sk); + + return mask; +} + +static int mptcp_shutdown(struct socket *sock, int how) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; + int ret = 0; + + pr_debug("sk=%p, how=%d", msk, how); + + lock_sock(sock->sk); + + if (how == SHUT_WR || how == SHUT_RDWR) + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); + + how++; + + if ((how & ~SHUTDOWN_MASK) || !how) { + ret = -EINVAL; + goto out_unlock; + } + + if (sock->state == SS_CONNECTING) { + if ((1 << sock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(tcp_sk, how); + } + +out_unlock: + release_sock(sock->sk); + + return ret; +} + +static const struct proto_ops mptcp_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v4_getname, + .poll = mptcp_poll, + .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw mptcp_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_prot, + .ops = &mptcp_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +void mptcp_proto_init(void) +{ + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + + mptcp_subflow_init(); + + if (proto_register(&mptcp_prot, 1) != 0) + panic("Failed to register MPTCP proto.\n"); + + inet_register_protosw(&mptcp_protosw); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static const struct proto_ops mptcp_v6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v6_getname, + .poll = 
mptcp_poll, + .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet6_sendmsg, + .recvmsg = inet6_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct proto mptcp_v6_prot; + +static void mptcp_v6_destroy(struct sock *sk) +{ + mptcp_destroy(sk); + inet6_destroy_sock(sk); +} + +static struct inet_protosw mptcp_v6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_v6_prot, + .ops = &mptcp_v6_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +int mptcp_proto_v6_init(void) +{ + int err; + + mptcp_v6_prot = mptcp_prot; + strcpy(mptcp_v6_prot.name, "MPTCPv6"); + mptcp_v6_prot.slab = NULL; + mptcp_v6_prot.destroy = mptcp_v6_destroy; + mptcp_v6_prot.obj_size = sizeof(struct mptcp_sock) + + sizeof(struct ipv6_pinfo); + + err = proto_register(&mptcp_v6_prot, 1); + if (err) + return err; + + err = inet6_register_protosw(&mptcp_v6_protosw); + if (err) + proto_unregister(&mptcp_v6_prot); + + return err; +} +#endif diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h new file mode 100644 index 000000000000..8a99a2930284 --- /dev/null +++ b/net/mptcp/protocol.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#ifndef __MPTCP_PROTOCOL_H +#define __MPTCP_PROTOCOL_H + +#include <linux/random.h> +#include <net/tcp.h> +#include <net/inet_connection_sock.h> + +#define MPTCP_SUPPORTED_VERSION 1 + +/* MPTCP option bits */ +#define OPTION_MPTCP_MPC_SYN BIT(0) +#define OPTION_MPTCP_MPC_SYNACK BIT(1) +#define OPTION_MPTCP_MPC_ACK BIT(2) + +/* MPTCP option subtypes */ +#define MPTCPOPT_MP_CAPABLE 0 +#define MPTCPOPT_MP_JOIN 1 +#define MPTCPOPT_DSS 2 +#define MPTCPOPT_ADD_ADDR 3 +#define MPTCPOPT_RM_ADDR 4 +#define MPTCPOPT_MP_PRIO 5 +#define MPTCPOPT_MP_FAIL 6 +#define MPTCPOPT_MP_FASTCLOSE 7 + +/* MPTCP suboption lengths */ +#define TCPOLEN_MPTCP_MPC_SYN 4 +#define TCPOLEN_MPTCP_MPC_SYNACK 12 +#define TCPOLEN_MPTCP_MPC_ACK 20 +#define TCPOLEN_MPTCP_MPC_ACK_DATA 22 +#define TCPOLEN_MPTCP_DSS_BASE 4 +#define TCPOLEN_MPTCP_DSS_ACK32 4 +#define TCPOLEN_MPTCP_DSS_ACK64 8 +#define TCPOLEN_MPTCP_DSS_MAP32 10 +#define TCPOLEN_MPTCP_DSS_MAP64 14 +#define TCPOLEN_MPTCP_DSS_CHECKSUM 2 + +/* MPTCP MP_CAPABLE flags */ +#define MPTCP_VERSION_MASK (0x0F) +#define MPTCP_CAP_CHECKSUM_REQD BIT(7) +#define MPTCP_CAP_EXTENSIBILITY BIT(6) +#define MPTCP_CAP_HMAC_SHA256 BIT(0) +#define MPTCP_CAP_FLAG_MASK (0x3F) + +/* MPTCP DSS flags */ +#define MPTCP_DSS_DATA_FIN BIT(4) +#define MPTCP_DSS_DSN64 BIT(3) +#define MPTCP_DSS_HAS_MAP BIT(2) +#define MPTCP_DSS_ACK64 BIT(1) +#define MPTCP_DSS_HAS_ACK BIT(0) +#define MPTCP_DSS_FLAG_MASK (0x1F) + +/* MPTCP socket flags */ +#define MPTCP_DATA_READY BIT(0) +#define MPTCP_SEND_SPACE BIT(1) + +/* MPTCP connection sock */ +struct mptcp_sock { + /* inet_connection_sock must be the first member */ + struct inet_connection_sock sk; + u64 local_key; + u64 remote_key; + u64 write_seq; + u64 ack_seq; + u32 token; + unsigned long flags; + bool can_ack; + struct list_head conn_list; + struct skb_ext *cached_ext; /* for the next sendmsg */ + struct socket *subflow; /* outgoing connect/listener/!mp_capable */ + struct sock *first; +}; + +#define mptcp_for_each_subflow(__msk, 
__subflow) \ + list_for_each_entry(__subflow, &((__msk)->conn_list), node) + +static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) +{ + return (struct mptcp_sock *)sk; +} + +struct mptcp_subflow_request_sock { + struct tcp_request_sock sk; + u16 mp_capable : 1, + mp_join : 1, + backup : 1, + remote_key_valid : 1; + u64 local_key; + u64 remote_key; + u64 idsn; + u32 token; + u32 ssn_offset; +}; + +static inline struct mptcp_subflow_request_sock * +mptcp_subflow_rsk(const struct request_sock *rsk) +{ + return (struct mptcp_subflow_request_sock *)rsk; +} + +/* MPTCP subflow context */ +struct mptcp_subflow_context { + struct list_head node;/* conn_list of subflows */ + u64 local_key; + u64 remote_key; + u64 idsn; + u64 map_seq; + u32 snd_isn; + u32 token; + u32 rel_write_seq; + u32 map_subflow_seq; + u32 ssn_offset; + u32 map_data_len; + u32 request_mptcp : 1, /* send MP_CAPABLE */ + mp_capable : 1, /* remote is MPTCP capable */ + fourth_ack : 1, /* send initial DSS */ + conn_finished : 1, + map_valid : 1, + mpc_map : 1, + data_avail : 1, + rx_eof : 1, + can_ack : 1; /* only after processing the remote key */ + + struct sock *tcp_sock; /* tcp sk backpointer */ + struct sock *conn; /* parent mptcp_sock */ + const struct inet_connection_sock_af_ops *icsk_af_ops; + void (*tcp_data_ready)(struct sock *sk); + void (*tcp_state_change)(struct sock *sk); + void (*tcp_write_space)(struct sock *sk); + + struct rcu_head rcu; +}; + +static inline struct mptcp_subflow_context * +mptcp_subflow_ctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* Use RCU on icsk_ulp_data only for sock diag code */ + return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; +} + +static inline struct sock * +mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) +{ + return subflow->tcp_sock; +} + +static inline u64 +mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) +{ + return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - + subflow->ssn_offset - + subflow->map_subflow_seq; +} + +static inline u64 +mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) +{ + return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); +} + +int mptcp_is_enabled(struct net *net); +bool mptcp_subflow_data_available(struct sock *sk); +void mptcp_subflow_init(void); +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); + +static inline void mptcp_subflow_tcp_fallback(struct sock *sk, + struct mptcp_subflow_context *ctx) +{ + sk->sk_data_ready = ctx->tcp_data_ready; + sk->sk_state_change = ctx->tcp_state_change; + sk->sk_write_space = ctx->tcp_write_space; + + inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; +} + +extern const struct inet_connection_sock_af_ops ipv4_specific; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +extern const struct inet_connection_sock_af_ops ipv6_specific; +#endif + +void mptcp_proto_init(void); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int mptcp_proto_v6_init(void); +#endif + +struct mptcp_read_arg { + struct msghdr *msg; +}; + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len); + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx); + +void mptcp_finish_connect(struct sock *sk); + +int mptcp_token_new_request(struct request_sock *req); +void mptcp_token_destroy_request(u32 token); +int mptcp_token_new_connect(struct sock *sk); +int mptcp_token_new_accept(u32 token); +void 
mptcp_token_update_accept(struct sock *sk, struct sock *conn); +void mptcp_token_destroy(u32 token); + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); +static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) +{ + /* we might consider a faster version that computes the key as a + * hash of some information available in the MPTCP socket. Use + * random data at the moment, as it's probably the safest option + * in case multiple sockets are opened in different namespaces at + * the same time. + */ + get_random_bytes(key, sizeof(u64)); + mptcp_crypto_key_sha(*key, token, idsn); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hash_out); + +static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +{ + return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); +} + +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) + +#endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c new file mode 100644 index 000000000000..1662e1178949 --- /dev/null +++ b/net/mptcp/subflow.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/protocol.h> +#include <net/tcp.h> +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include <net/ip6_route.h> +#endif +#include <net/mptcp.h> +#include "protocol.h" + +static int subflow_rebuild_header(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + int err = 0; + + if (subflow->request_mptcp && !subflow->token) { + pr_debug("subflow=%p", sk); + err = mptcp_token_new_connect(sk); + } + + if (err) + return err; + + return subflow->icsk_af_ops->rebuild_header(sk); +} + +static void subflow_req_destructor(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + pr_debug("subflow_req=%p", subflow_req); + + if (subflow_req->mp_capable) + mptcp_token_destroy_request(subflow_req->token); + tcp_request_sock_ops.destructor(req); +} + +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct tcp_options_received rx_opt; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); + mptcp_get_options(skb, &rx_opt); + + subflow_req->mp_capable = 0; + subflow_req->remote_key_valid = 0; + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. 
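+ * (for reference: the MD5 option alone consumes 18 of the 40 + * bytes available for TCP options) + 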
+ */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return; +#endif + + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { + int err; + + err = mptcp_token_new_request(req); + if (err == 0) + subflow_req->mp_capable = 1; + + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + } +} + +static void subflow_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static void subflow_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} +#endif + +static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); + + if (subflow->conn && !subflow->conn_finished) { + pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), + subflow->remote_key); + mptcp_finish_connect(sk); + subflow->conn_finished = 1; + + if (skb) { + pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + } + } +} + +static struct request_sock_ops subflow_request_sock_ops; +static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; + +static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + /* Never answer to SYNs sent to broadcast or multicast */ + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv4_ops, + sk, skb); +drop: + tcp_listendrop(sk); + return 0; +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; +static struct inet_connection_sock_af_ops subflow_v6_specific; +static struct inet_connection_sock_af_ops subflow_v6m_specific; + +static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + if (skb->protocol == htons(ETH_P_IP)) + return subflow_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv6_ops, sk, skb); + +drop: + tcp_listendrop(sk); + return 0; /* don't send reset */ +} +#endif + +static struct sock *subflow_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); + struct mptcp_subflow_request_sock *subflow_req; + struct tcp_options_received opt_rx; + struct sock *child; + + pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + + /* if the sk is MP_CAPABLE, we try to fetch the client key */ + subflow_req = mptcp_subflow_rsk(req); + if (subflow_req->mp_capable) { + if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) { + /* here we can receive and accept an in-window, + * out-of-order pkt, which will not carry the MP_CAPABLE + * opt even on mptcp enabled paths + */ + goto create_child; + } + + 
opt_rx.mptcp.mp_capable = 0; + mptcp_get_options(skb, &opt_rx); + if (opt_rx.mptcp.mp_capable) { + subflow_req->remote_key = opt_rx.mptcp.sndr_key; + subflow_req->remote_key_valid = 1; + } else { + subflow_req->mp_capable = 0; + } + } + +create_child: + child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + req_unhash, own_req); + + if (child && *own_req) { + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + + /* a NULL ctx means TCP fallback; that is not fatal for the + * MPC handshake + */ + if (!ctx) + return child; + + if (ctx->mp_capable) { + if (mptcp_token_new_accept(ctx->token)) + goto close_child; + } + } + + return child; + +close_child: + pr_debug("closing child socket"); + tcp_send_active_reset(child, GFP_ATOMIC); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return NULL; +} + +static struct inet_connection_sock_af_ops subflow_specific; + +enum mapping_status { + MAPPING_OK, + MAPPING_INVALID, + MAPPING_EMPTY, + MAPPING_DATA_FIN +}; + +static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) +{ + if ((u32)seq == (u32)old_seq) + return old_seq; + + /* Assume map covers data not mapped yet. */ + return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); +} + +static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) +{ + WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + ssn, subflow->map_subflow_seq, subflow->map_data_len); +} + +static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned int skb_consumed; + + skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(skb_consumed >= skb->len)) + return true; + + return skb->len - skb_consumed <= subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); +} + +static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + + if (unlikely(before(ssn, subflow->map_subflow_seq))) { + /* Mapping covers data later in the subflow stream, + * currently unsupported. 
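+ * (e.g. a mapping starting at subflow seq 2000 while the + * current ssn is still 1500) + 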
+ */ + warn_bad_map(subflow, ssn); + return false; + } + if (unlikely(!before(ssn, subflow->map_subflow_seq + + subflow->map_data_len))) { + /* Mapping does covers past subflow data, invalid */ + warn_bad_map(subflow, ssn + skb->len); + return false; + } + return true; +} + +static enum mapping_status get_mapping_status(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_ext *mpext; + struct sk_buff *skb; + u16 data_len; + u64 map_seq; + + skb = skb_peek(&ssk->sk_receive_queue); + if (!skb) + return MAPPING_EMPTY; + + mpext = mptcp_get_ext(skb); + if (!mpext || !mpext->use_map) { + if (!subflow->map_valid && !skb->len) { + /* the TCP stack deliver 0 len FIN pkt to the receive + * queue, that is the only 0len pkts ever expected here, + * and we can admit no mapping only for 0 len pkts + */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + WARN_ONCE(1, "0len seq %d:%d flags %x", + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + TCP_SKB_CB(skb)->tcp_flags); + sk_eat_skb(ssk, skb); + return MAPPING_EMPTY; + } + + if (!subflow->map_valid) + return MAPPING_INVALID; + + goto validate_seq; + } + + pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d", + mpext->data_seq, mpext->dsn64, mpext->subflow_seq, + mpext->data_len, mpext->data_fin); + + data_len = mpext->data_len; + if (data_len == 0) { + pr_err("Infinite mapping not handled"); + return MAPPING_INVALID; + } + + if (mpext->data_fin == 1) { + if (data_len == 1) { + pr_debug("DATA_FIN with no payload"); + if (subflow->map_valid) { + /* A DATA_FIN might arrive in a DSS + * option before the previous mapping + * has been fully consumed. Continue + * handling the existing mapping. + */ + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } else { + return MAPPING_DATA_FIN; + } + } + + /* Adjust for DATA_FIN using 1 byte of sequence space */ + data_len--; + } + + if (!mpext->dsn64) { + map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, + mpext->data_seq); + pr_debug("expanded seq=%llu", subflow->map_seq); + } else { + map_seq = mpext->data_seq; + } + + if (subflow->map_valid) { + /* Allow replacing only with an identical map */ + if (subflow->map_seq == map_seq && + subflow->map_subflow_seq == mpext->subflow_seq && + subflow->map_data_len == data_len) { + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } + + /* If this skb data are fully covered by the current mapping, + * the new map would need caching, which is not supported + */ + if (skb_is_fully_mapped(ssk, skb)) + return MAPPING_INVALID; + + /* will validate the next map after consuming the current one */ + return MAPPING_OK; + } + + subflow->map_seq = map_seq; + subflow->map_subflow_seq = mpext->subflow_seq; + subflow->map_data_len = data_len; + subflow->map_valid = 1; + subflow->mpc_map = mpext->mpc_map; + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_seq, subflow->map_subflow_seq, + subflow->map_data_len); + +validate_seq: + /* we revalidate valid mapping on new skb, because we must ensure + * the current skb is completely covered by the available mapping + */ + if (!validate_mapping(ssk, skb)) + return MAPPING_INVALID; + + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; +} + +static bool subflow_check_data_avail(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + enum mapping_status status; + struct mptcp_sock *msk; + struct sk_buff *skb; + + pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, + subflow->data_avail, 
skb_peek(&ssk->sk_receive_queue)); + if (subflow->data_avail) + return true; + + if (!subflow->conn) + return false; + + msk = mptcp_sk(subflow->conn); + for (;;) { + u32 map_remaining; + size_t delta; + u64 ack_seq; + u64 old_ack; + + status = get_mapping_status(ssk); + pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); + if (status == MAPPING_INVALID) { + ssk->sk_err = EBADMSG; + goto fatal; + } + + if (status != MAPPING_OK) + return false; + + skb = skb_peek(&ssk->sk_receive_queue); + if (WARN_ON_ONCE(!skb)) + return false; + + /* if msk lacks the remote key, this subflow must provide an + * MP_CAPABLE-based mapping + */ + if (unlikely(!READ_ONCE(msk->can_ack))) { + if (!subflow->mpc_map) { + ssk->sk_err = EBADMSG; + goto fatal; + } + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->ack_seq, subflow->map_seq); + WRITE_ONCE(msk->can_ack, true); + } + + old_ack = READ_ONCE(msk->ack_seq); + ack_seq = mptcp_subflow_get_mapped_dsn(subflow); + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, + ack_seq); + if (ack_seq == old_ack) + break; + + /* only accept in-sequence mappings. Old values are spurious + * retransmissions; we can hit "future" values on active backup + * subflow switch; we rely on retransmissions to get + * in-sequence data. + * Concurrent subflow support will require subflow data + * reordering + */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + if (before64(ack_seq, old_ack)) + delta = min_t(size_t, old_ack - ack_seq, map_remaining); + else + delta = min_t(size_t, ack_seq - old_ack, map_remaining); + + /* discard mapped data */ + pr_debug("discarding %zu bytes, current map len=%d", delta, + map_remaining); + if (delta) { + struct mptcp_read_arg arg = { + .msg = NULL, + }; + read_descriptor_t desc = { + .count = delta, + .arg.data = &arg, + }; + int ret; + + ret = tcp_read_sock(ssk, &desc, mptcp_read_actor); + if (ret < 0) { + ssk->sk_err = -ret; + goto fatal; + } + if (ret < delta) + return false; + if (delta == map_remaining) + subflow->map_valid = 0; + } + } + return true; + +fatal: + /* fatal protocol error, close the socket */ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); + ssk->sk_error_report(ssk); + tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + return false; +} + +bool mptcp_subflow_data_available(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sk_buff *skb; + + /* check if current mapping is still valid */ + if (subflow->map_valid && + mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { + subflow->map_valid = 0; + subflow->data_avail = 0; + + pr_debug("Done with mapping: seq=%u data_len=%u", + subflow->map_subflow_seq, + subflow->map_data_len); + } + + if (!subflow_check_data_avail(sk)) { + subflow->data_avail = 0; + return false; + } + + skb = skb_peek(&sk->sk_receive_queue); + subflow->data_avail = skb && + before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq); + return subflow->data_avail; +} + +static void subflow_data_ready(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + if (!parent || !subflow->mp_capable) { + subflow->tcp_data_ready(sk); + + if (parent) + parent->sk_data_ready(parent); + return; + } + + if (mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } +} + +static void subflow_write_space(struct sock *sk) +{ + 
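/* propagate write space from the subflow to the msk, waking + * senders blocked on the parent socket + */ + 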
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + sk_stream_write_space(sk); + if (parent && sk_stream_is_writeable(sk)) { + set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); + smp_mb__after_atomic(); + /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ + sk_stream_write_space(parent); + } +} + +static struct inet_connection_sock_af_ops * +subflow_default_af_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (sk->sk_family == AF_INET6) + return &subflow_v6_specific; +#endif + return &subflow_specific; +} + +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock_af_ops *target; + + target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); + + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); + + if (likely(icsk->icsk_af_ops == target)) + return; + + subflow->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = target; +#endif +} + +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) +{ + struct mptcp_subflow_context *subflow; + struct net *net = sock_net(sk); + struct socket *sf; + int err; + + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, + &sf); + if (err) + return err; + + lock_sock(sf->sk); + + /* kernel sockets do not acquire a net ref by default, but the + * TCP timer needs one. + */ + sf->sk->sk_net_refcnt = 1; + get_net(net); + this_cpu_add(*net->core.sock_inuse, 1); + err = tcp_set_ulp(sf->sk, "mptcp"); + release_sock(sf->sk); + + if (err) + return err; + + subflow = mptcp_subflow_ctx(sf->sk); + pr_debug("subflow=%p", subflow); + + *new_sock = sf; + sock_hold(sk); + subflow->conn = sk; + + return 0; +} + +static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, + gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + + ctx = kzalloc(sizeof(*ctx), priority); + if (!ctx) + return NULL; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_LIST_HEAD(&ctx->node); + + pr_debug("subflow=%p", ctx); + + ctx->tcp_sock = sk; + + return ctx; +} + +static void __subflow_state_change(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static bool subflow_is_done(const struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +static void subflow_state_change(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = READ_ONCE(subflow->conn); + + __subflow_state_change(sk); + + /* as recvmsg() does not acquire the subflow socket for ssk selection + * a fin packet carrying a DSS can go unnoticed if we don't trigger + * the data available machinery here. 
+ */ + if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } + + if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) && + !subflow->rx_eof && subflow_is_done(sk)) { + subflow->rx_eof = 1; + parent->sk_shutdown |= RCV_SHUTDOWN; + __subflow_state_change(parent); + } +} + +static int subflow_ulp_init(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + + /* disallow attaching ULP to a socket unless it has been + * created with sock_create_kern() + */ + if (!sk->sk_kern_sock) { + err = -EOPNOTSUPP; + goto out; + } + + ctx = subflow_create_ctx(sk, GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + + tp->is_mptcp = 1; + ctx->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = subflow_default_af_ops(sk); + ctx->tcp_data_ready = sk->sk_data_ready; + ctx->tcp_state_change = sk->sk_state_change; + ctx->tcp_write_space = sk->sk_write_space; + sk->sk_data_ready = subflow_data_ready; + sk->sk_write_space = subflow_write_space; + sk->sk_state_change = subflow_state_change; +out: + return err; +} + +static void subflow_ulp_release(struct sock *sk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + + if (!ctx) + return; + + if (ctx->conn) + sock_put(ctx->conn); + + kfree_rcu(ctx, rcu); +} + +static void subflow_ulp_fallback(struct sock *sk, + struct mptcp_subflow_context *old_ctx) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + mptcp_subflow_tcp_fallback(sk, old_ctx); + icsk->icsk_ulp_ops = NULL; + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tcp_sk(sk)->is_mptcp = 0; +} + +static void subflow_ulp_clone(const struct request_sock *req, + struct sock *newsk, + const gfp_t priority) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); + struct mptcp_subflow_context *new_ctx; + + if (!subflow_req->mp_capable) { + subflow_ulp_fallback(newsk, old_ctx); + return; + } + + new_ctx = subflow_create_ctx(newsk, priority); + if (!new_ctx) { + subflow_ulp_fallback(newsk, old_ctx); + return; + } + + /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully + * established only after we receive the remote key + */ + new_ctx->conn_finished = 1; + new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; + new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; + new_ctx->tcp_state_change = old_ctx->tcp_state_change; + new_ctx->tcp_write_space = old_ctx->tcp_write_space; + new_ctx->mp_capable = 1; + new_ctx->fourth_ack = subflow_req->remote_key_valid; + new_ctx->can_ack = subflow_req->remote_key_valid; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; + new_ctx->token = subflow_req->token; + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->idsn = subflow_req->idsn; +} + +static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { + .name = "mptcp", + .owner = THIS_MODULE, + .init = subflow_ulp_init, + .release = subflow_ulp_release, + .clone = subflow_ulp_clone, +}; + +static int subflow_ops_init(struct request_sock_ops *subflow_ops) +{ + subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); + subflow_ops->slab_name = "request_sock_subflow"; + + subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, + subflow_ops->obj_size, 0, + SLAB_ACCOUNT | + 
SLAB_TYPESAFE_BY_RCU, + NULL); + if (!subflow_ops->slab) + return -ENOMEM; + + subflow_ops->destructor = subflow_req_destructor; + + return 0; +} + +void mptcp_subflow_init(void) +{ + subflow_request_sock_ops = tcp_request_sock_ops; + if (subflow_ops_init(&subflow_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow request sock ops\n"); + + subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; + subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + + subflow_specific = ipv4_specific; + subflow_specific.conn_request = subflow_v4_conn_request; + subflow_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_specific.rebuild_header = subflow_rebuild_header; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; + subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + + subflow_v6_specific = ipv6_specific; + subflow_v6_specific.conn_request = subflow_v6_conn_request; + subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_v6_specific.rebuild_header = subflow_rebuild_header; + + subflow_v6m_specific = subflow_v6_specific; + subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; + subflow_v6m_specific.send_check = ipv4_specific.send_check; + subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; + subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; + subflow_v6m_specific.net_frag_header_len = 0; +#endif + + if (tcp_register_ulp(&subflow_ulp_ops) != 0) + panic("MPTCP: failed to register subflows to ULP\n"); +} diff --git a/net/mptcp/token.c b/net/mptcp/token.c new file mode 100644 index 000000000000..84d887806090 --- /dev/null +++ b/net/mptcp/token.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP token management + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org, + * authored by: + * + * Sébastien Barré <sebastien.barre@uclouvain.be> + * Christoph Paasch <christoph.paasch@uclouvain.be> + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> + * Gregory Detal <gregory.detal@uclouvain.be> + * Fabien Duchêne <fabien.duchene@uclouvain.be> + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> + * Lavkesh Lahngir <lavkesh51@gmail.com> + * Andreas Ripke <ripke@neclab.eu> + * Vlad Dogaru <vlad.dogaru@intel.com> + * Octavian Purdila <octavian.purdila@intel.com> + * John Ronan <jronan@tssg.org> + * Catalin Nicutar <catalin.nicutar@gmail.com> + * Brandon Heller <brandonh@stanford.edu> + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/radix-tree.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/protocol.h> +#include <net/mptcp.h> +#include "protocol.h" + +static RADIX_TREE(token_tree, GFP_ATOMIC); +static RADIX_TREE(token_req_tree, GFP_ATOMIC); +static DEFINE_SPINLOCK(token_tree_lock); +static int token_used __read_mostly; + +/** + * mptcp_token_new_request - create new key/idsn/token for subflow_request + * @req: the request socket + * + * This function is called when a new mptcp connection is coming in. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * Returns 0 on success. 
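+ * Otherwise returns the radix tree insertion error, typically + * -ENOMEM. + 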
+ */ +int mptcp_token_new_request(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + + err = radix_tree_insert(&token_req_tree, + subflow_req->token, &token_used); + spin_unlock_bh(&token_tree_lock); + return err; +} + +/** + * mptcp_token_new_connect - create new key/idsn/token for subflow + * @sk - the socket that will initiate a connection + * + * This function is called when a new outgoing mptcp connection is + * initiated. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * On success, the mptcp connection can be found again using + * the computed token at a later time, this is needed to process + * join requests. + * + * returns 0 on success. + */ +int mptcp_token_new_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *mptcp_sock = subflow->conn; + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); + + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + + token = subflow->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_new_accept - insert token for later processing + * @token: the token to insert to the tree + * + * Called when a SYN packet creates a new logical connection, i.e. + * is not a join request. + * + * We don't have an mptcp socket yet at that point. + * This is paired with mptcp_token_update_accept, called on accept(). + */ +int mptcp_token_new_accept(u32 token) +{ + int err; + + spin_lock_bh(&token_tree_lock); + err = radix_tree_insert(&token_tree, token, &token_used); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_update_accept - update token to map to mptcp socket + * @conn: the new struct mptcp_sock + * @sk: the initial subflow for this mptcp socket + * + * Called when the first mptcp socket is created on accept to + * refresh the dummy mapping (done to reserve the token) with + * the mptcp_socket structure that wasn't allocated before. 
+ */ +void mptcp_token_update_accept(struct sock *sk, struct sock *conn) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + void __rcu **slot; + + spin_lock_bh(&token_tree_lock); + slot = radix_tree_lookup_slot(&token_tree, subflow->token); + WARN_ON_ONCE(!slot); + if (slot) { + WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used); + radix_tree_replace_slot(&token_tree, slot, conn); + } + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy_request - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove not-yet-fully-established incoming connection identified + * by @token. + */ +void mptcp_token_destroy_request(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_req_tree, token); + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove the connection identified by @token. + */ +void mptcp_token_destroy(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_tree, token); + spin_unlock_bh(&token_tree_lock); +} |