summaryrefslogtreecommitdiffstats
path: root/net/mptcp/subflow.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-10-15 18:42:13 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-10-15 18:42:13 -0700
commit9ff9b0d392ea08090cd1780fb196f36dbb586529 (patch)
tree276a3a5c4525b84dee64eda30b423fc31bf94850 /net/mptcp/subflow.c
parent840e5bb326bbcb16ce82dd2416d2769de4839aea (diff)
parent105faa8742437c28815b2a3eb8314ebc5fd9288c (diff)
downloadlinux-9ff9b0d392ea08090cd1780fb196f36dbb586529.tar.bz2
Merge tag 'net-next-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: - Add redirect_neigh() BPF packet redirect helper, allowing to limit stack traversal in common container configs and improving TCP back-pressure. Daniel reports ~10Gbps => ~15Gbps single stream TCP performance gain. - Expand netlink policy support and improve policy export to user space. (Ge)netlink core performs request validation according to declared policies. Expand the expressiveness of those policies (min/max length and bitmasks). Allow dumping policies for particular commands. This is used for feature discovery by user space (instead of kernel version parsing or trial and error). - Support IGMPv3/MLDv2 multicast listener discovery protocols in bridge. - Allow more than 255 IPv4 multicast interfaces. - Add support for Type of Service (ToS) reflection in SYN/SYN-ACK packets of TCPv6. - In Multi-patch TCP (MPTCP) support concurrent transmission of data on multiple subflows in a load balancing scenario. Enhance advertising addresses via the RM_ADDR/ADD_ADDR options. - Support SMC-Dv2 version of SMC, which enables multi-subnet deployments. - Allow more calls to same peer in RxRPC. - Support two new Controller Area Network (CAN) protocols - CAN-FD and ISO 15765-2:2016. - Add xfrm/IPsec compat layer, solving the 32bit user space on 64bit kernel problem. - Add TC actions for implementing MPLS L2 VPNs. - Improve nexthop code - e.g. handle various corner cases when nexthop objects are removed from groups better, skip unnecessary notifications and make it easier to offload nexthops into HW by converting to a blocking notifier. - Support adding and consuming TCP header options by BPF programs, opening the doors for easy experimental and deployment-specific TCP option use. - Reorganize TCP congestion control (CC) initialization to simplify life of TCP CC implemented in BPF. - Add support for shipping BPF programs with the kernel and loading them early on boot via the User Mode Driver mechanism, hence reusing all the user space infra we have. - Support sleepable BPF programs, initially targeting LSM and tracing. - Add bpf_d_path() helper for returning full path for given 'struct path'. - Make bpf_tail_call compatible with bpf-to-bpf calls. - Allow BPF programs to call map_update_elem on sockmaps. - Add BPF Type Format (BTF) support for type and enum discovery, as well as support for using BTF within the kernel itself (current use is for pretty printing structures). - Support listing and getting information about bpf_links via the bpf syscall. - Enhance kernel interfaces around NIC firmware update. Allow specifying overwrite mask to control if settings etc. are reset during update; report expected max time operation may take to users; support firmware activation without machine reboot incl. limits of how much impact reset may have (e.g. dropping link or not). - Extend ethtool configuration interface to report IEEE-standard counters, to limit the need for per-vendor logic in user space. - Adopt or extend devlink use for debug, monitoring, fw update in many drivers (dsa loop, ice, ionic, sja1105, qed, mlxsw, mv88e6xxx, dpaa2-eth). - In mlxsw expose critical and emergency SFP module temperature alarms. Refactor port buffer handling to make the defaults more suitable and support setting these values explicitly via the DCBNL interface. - Add XDP support for Intel's igb driver. - Support offloading TC flower classification and filtering rules to mscc_ocelot switches. - Add PTP support for Marvell Octeontx2 and PP2.2 hardware, as well as fixed interval period pulse generator and one-step timestamping in dpaa-eth. - Add support for various auth offloads in WiFi APs, e.g. SAE (WPA3) offload. - Add Lynx PHY/PCS MDIO module, and convert various drivers which have this HW to use it. Convert mvpp2 to split PCS. - Support Marvell Prestera 98DX3255 24-port switch ASICs, as well as 7-port Mediatek MT7531 IP. - Add initial support for QCA6390 and IPQ6018 in ath11k WiFi driver, and wcn3680 support in wcn36xx. - Improve performance for packets which don't require much offloads on recent Mellanox NICs by 20% by making multiple packets share a descriptor entry. - Move chelsio inline crypto drivers (for TLS and IPsec) from the crypto subtree to drivers/net. Move MDIO drivers out of the phy directory. - Clean up a lot of W=1 warnings, reportedly the actively developed subsections of networking drivers should now build W=1 warning free. - Make sure drivers don't use in_interrupt() to dynamically adapt their code. Convert tasklets to use new tasklet_setup API (sadly this conversion is not yet complete). * tag 'net-next-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2583 commits) Revert "bpfilter: Fix build error with CONFIG_BPFILTER_UMH" net, sockmap: Don't call bpf_prog_put() on NULL pointer bpf, selftest: Fix flaky tcp_hdr_options test when adding addr to lo bpf, sockmap: Add locking annotations to iterator netfilter: nftables: allow re-computing sctp CRC-32C in 'payload' statements net: fix pos incrementment in ipv6_route_seq_next net/smc: fix invalid return code in smcd_new_buf_create() net/smc: fix valid DMBE buffer sizes net/smc: fix use-after-free of delayed events bpfilter: Fix build error with CONFIG_BPFILTER_UMH cxgb4/ch_ipsec: Replace the module name to ch_ipsec from chcr net: sched: Fix suspicious RCU usage while accessing tcf_tunnel_info bpf: Fix register equivalence tracking. rxrpc: Fix loss of final ack on shutdown rxrpc: Fix bundle counting for exclusive connections netfilter: restore NF_INET_NUMHOOKS ibmveth: Identify ingress large send packets. ibmveth: Switch order of ibmveth_helper calls. cxgb4: handle 4-tuple PEDIT to NAT mode translation selftests: Add VRF route leaking tests ...
Diffstat (limited to 'net/mptcp/subflow.c')
-rw-r--r--net/mptcp/subflow.c119
1 files changed, 55 insertions, 64 deletions
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 6f035af1c9d2..ac4a1fe3550b 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,6 +20,7 @@
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
+#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
@@ -270,6 +271,19 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
return thmac == subflow->thmac;
}
+void mptcp_subflow_reset(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+
+ tcp_set_state(ssk, TCP_CLOSE);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ tcp_done(ssk);
+ if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
+ schedule_work(&mptcp_sk(sk)->work))
+ sock_hold(sk);
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -342,8 +356,7 @@ fallback:
return;
do_reset:
- tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_done(sk);
+ mptcp_subflow_reset(sk);
}
struct request_sock_ops mptcp_subflow_request_sock_ops;
@@ -434,7 +447,7 @@ static void mptcp_sock_destruct(struct sock *sk)
sock_orphan(sk);
}
- mptcp_token_destroy(mptcp_sk(sk));
+ mptcp_destroy_common(mptcp_sk(sk));
inet_sock_destruct(sk);
}
@@ -769,12 +782,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
if (!mpext->dsn64) {
map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
mpext->data_seq);
- subflow->use_64bit_ack = 0;
pr_debug("expanded seq=%llu", subflow->map_seq);
} else {
map_seq = mpext->data_seq;
- subflow->use_64bit_ack = 1;
}
+ WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
if (subflow->map_valid) {
/* Allow replacing only with an identical map */
@@ -817,16 +829,25 @@ validate_seq:
return MAPPING_OK;
}
-static int subflow_read_actor(read_descriptor_t *desc,
- struct sk_buff *skb,
- unsigned int offset, size_t len)
+static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
+ u64 limit)
{
- size_t copy_len = min(desc->count, len);
-
- desc->count -= copy_len;
-
- pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
- return copy_len;
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ u32 incr;
+
+ incr = limit >= skb->len ? skb->len + fin : limit;
+
+ pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
+ subflow->map_subflow_seq);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
+ tcp_sk(ssk)->copied_seq += incr;
+ if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
+ sk_eat_skb(ssk, skb);
+ if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
+ subflow->map_valid = 0;
+ if (incr)
+ tcp_cleanup_rbuf(ssk, incr);
}
static bool subflow_check_data_avail(struct sock *ssk)
@@ -838,13 +859,13 @@ static bool subflow_check_data_avail(struct sock *ssk)
pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (!skb_peek(&ssk->sk_receive_queue))
+ subflow->data_avail = 0;
if (subflow->data_avail)
return true;
msk = mptcp_sk(subflow->conn);
for (;;) {
- u32 map_remaining;
- size_t delta;
u64 ack_seq;
u64 old_ack;
@@ -862,6 +883,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
subflow->map_data_len = skb->len;
subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
subflow->ssn_offset;
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
return true;
}
@@ -889,42 +911,18 @@ static bool subflow_check_data_avail(struct sock *ssk)
ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
ack_seq);
- if (ack_seq == old_ack)
+ if (ack_seq == old_ack) {
+ subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
break;
+ } else if (after64(ack_seq, old_ack)) {
+ subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
+ break;
+ }
/* only accept in-sequence mapping. Old values are spurious
- * retransmission; we can hit "future" values on active backup
- * subflow switch, we relay on retransmissions to get
- * in-sequence data.
- * Cuncurrent subflows support will require subflow data
- * reordering
+ * retransmission
*/
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- if (before64(ack_seq, old_ack))
- delta = min_t(size_t, old_ack - ack_seq, map_remaining);
- else
- delta = min_t(size_t, ack_seq - old_ack, map_remaining);
-
- /* discard mapped data */
- pr_debug("discarding %zu bytes, current map len=%d", delta,
- map_remaining);
- if (delta) {
- read_descriptor_t desc = {
- .count = delta,
- };
- int ret;
-
- ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
- if (ret < 0) {
- ssk->sk_err = -ret;
- goto fatal;
- }
- if (ret < delta)
- return false;
- if (delta == map_remaining)
- subflow->map_valid = 0;
- }
+ mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
}
return true;
@@ -935,13 +933,13 @@ fatal:
ssk->sk_error_report(ssk);
tcp_set_state(ssk, TCP_CLOSE);
tcp_send_active_reset(ssk, GFP_ATOMIC);
+ subflow->data_avail = 0;
return false;
}
bool mptcp_subflow_data_available(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sk_buff *skb;
/* check if current mapping is still valid */
if (subflow->map_valid &&
@@ -954,15 +952,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
subflow->map_data_len);
}
- if (!subflow_check_data_avail(sk)) {
- subflow->data_avail = 0;
- return false;
- }
-
- skb = skb_peek(&sk->sk_receive_queue);
- subflow->data_avail = skb &&
- before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
- return subflow->data_avail;
+ return subflow_check_data_avail(sk);
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
@@ -1009,8 +999,10 @@ static void subflow_write_space(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
- sk_stream_write_space(sk);
- if (sk_stream_is_writeable(sk)) {
+ if (!sk_stream_is_writeable(sk))
+ return;
+
+ if (sk_stream_is_writeable(parent)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
@@ -1069,8 +1061,7 @@ static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
#endif
}
-int __mptcp_subflow_connect(struct sock *sk, int ifindex,
- const struct mptcp_addr_info *loc,
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1115,7 +1106,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
if (loc->family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
- ssk->sk_bound_dev_if = ifindex;
+ ssk->sk_bound_dev_if = loc->ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
goto failed;
@@ -1127,7 +1118,7 @@ int __mptcp_subflow_connect(struct sock *sk, int ifindex,
subflow->local_id = local_id;
subflow->remote_id = remote_id;
subflow->request_join = 1;
- subflow->request_bkup = 1;
+ subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
mptcp_info2sockaddr(remote, &addr);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);