summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2022-02-11 11:14:58 +0000
committerDavid S. Miller <davem@davemloft.net>2022-02-11 11:14:58 +0000
commit1ea59b5e1ae34ddc47248d18d3bd252cd9854952 (patch)
tree3d90bd3ecb81a8a37387bc31fbcbf6e064142ebf
parent429c83c78ab213d9656721cf46ae05366098485c (diff)
parentf9496b7c1b48ce02cd17a3ee88b1e049c689a222 (diff)
downloadlinux-1ea59b5e1ae34ddc47248d18d3bd252cd9854952.tar.bz2
Merge branch 'smc-optimizations'
D. Wythe says: ==================== net/smc: Optimizing performance in short-lived scenarios This patch set aims to optimizing performance of SMC in short-lived links scenarios, which is quite unsatisfactory right now. In our benchmark, we test it with follow scripts: ./wrk -c 10000 -t 4 -H 'Connection: Close' -d 20 http://smc-server Current performance figures like that: Running 20s test @ http://11.213.45.6 4 threads and 10000 connections 4956 requests in 20.06s, 3.24MB read Socket errors: connect 0, read 0, write 672, timeout 0 Requests/sec: 247.07 Transfer/sec: 165.28KB There are many reasons for this phenomenon, this patch set doesn't solve it all though, but it can be well alleviated with it in. Patch 1/5 (Make smc_tcp_listen_work() independent) : Separate smc_tcp_listen_work() from smc_listen_work(), make them independent of each other, the busy SMC handshake can not affect new TCP connections visit any more. Avoid discarding a large number of TCP connections after being overstock, which is undoubtedly raise the connection establishment time. Patch 2/5 (Limit SMC backlog connections): Since patch 1 has separated smc_tcp_listen_work() from smc_listen_work(), an unrestricted TCP accept have come into being. This patch try to put a limit on SMC backlog connections refers to implementation of TCP. Patch 3/5 (Limit SMC visits when handshake workqueue congested): Considering the complexity of SMC handshake right now, in short-lived links scenarios, this may not be the main scenario of SMC though, it's performance is still quite poor. This patch try to provide constraint on SMC handshake when handshake workqueue congested, which is the sign of SMC handshake stacking in our opinion. Patch 4/5 (Dynamic control handshake limitation by socket options) This patch allow applications dynamically control the ability of SMC handshake limitation. Since SMC don't support set SMC socket option before, this patch also have to support SMC's owns socket options. Patch 5/5 (Add global configure for handshake limitation by netlink) This patch provides a way to get benefit of handshake limitation without modifying any code for applications, which is quite useful for most existing applications. After this patch set, performance figures like that: Running 20s test @ http://11.213.45.6 4 threads and 10000 connections 693253 requests in 20.10s, 452.88MB read Requests/sec: 34488.13 Transfer/sec: 22.53MB That's a quite well performance improvement, about to 6 to 7 times in my environment. --- changelog: v1 -> v2: - fix compile warning - fix invalid dependencies in kconfig v2 -> v3: - correct spelling mistakes - fix useless variable declare v3 -> v4 - make smc_tcp_ls_wq be static v4 -> v5 - add dynamic control for SMC auto fallback by socket options - add global configure for SMC auto fallback through netlink v5 -> v6 - move auto fallback to net namespace scope - remove auto fallback attribute in SMC_GEN_SYS_INFO - add independent attributes for auto fallback v6 -> v7 - fix wording and the naming issues, rename 'auto fallback' to handshake limitation. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/socket.h1
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/netns/smc.h2
-rw-r--r--include/uapi/linux/smc.h15
-rw-r--r--net/ipv4/tcp_input.c3
-rw-r--r--net/smc/af_smc.c184
-rw-r--r--net/smc/smc.h13
-rw-r--r--net/smc/smc_netlink.c15
-rw-r--r--net/smc/smc_pnet.c3
9 files changed, 233 insertions, 4 deletions
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 8ef26d89ef49..6f85f5d957ef 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -366,6 +366,7 @@ struct ucred {
#define SOL_XDP 283
#define SOL_MPTCP 284
#define SOL_MCTP 285
+#define SOL_SMC 286
/* IPX options */
#define IPX_TYPE 1
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 78b91bb92f0d..1168302b7927 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -394,6 +394,7 @@ struct tcp_sock {
bool is_mptcp;
#endif
#if IS_ENABLED(CONFIG_SMC)
+ bool (*smc_hs_congested)(const struct sock *sk);
bool syn_smc; /* SYN includes SMC */
#endif
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index ea8a9cf2619b..47b166684fd8 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -12,5 +12,7 @@ struct netns_smc {
/* protect fback_rsn */
struct mutex mutex_fback_rsn;
struct smc_stats_rsn *fback_rsn;
+
+ bool limit_smc_hs; /* constraint on handshake */
};
#endif
diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index 6c2874fd2c00..693f549f6966 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -59,6 +59,9 @@ enum {
SMC_NETLINK_DUMP_SEID,
SMC_NETLINK_ENABLE_SEID,
SMC_NETLINK_DISABLE_SEID,
+ SMC_NETLINK_DUMP_HS_LIMITATION,
+ SMC_NETLINK_ENABLE_HS_LIMITATION,
+ SMC_NETLINK_DISABLE_HS_LIMITATION,
};
/* SMC_GENL_FAMILY top level attributes */
@@ -284,4 +287,16 @@ enum {
__SMC_NLA_SEID_TABLE_MAX,
SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1
};
+
+/* SMC_NETLINK_HS_LIMITATION attributes */
+enum {
+ SMC_NLA_HS_LIMITATION_UNSPEC,
+ SMC_NLA_HS_LIMITATION_ENABLED, /* u8 */
+ __SMC_NLA_HS_LIMITATION_MAX,
+ SMC_NLA_HS_LIMITATION_MAX = __SMC_NLA_HS_LIMITATION_MAX - 1
+};
+
+/* SMC socket options */
+#define SMC_LIMIT_HS 1 /* constraint on smc handshake */
+
#endif /* _UAPI_LINUX_SMC_H */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index af94a6d22a9d..92e65d56dc2c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6703,7 +6703,8 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
- ireq->smc_ok = rx_opt->smc_ok;
+ ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
+ tcp_sk(sk)->smc_hs_congested(sk));
#endif
}
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 00b2e9deabb0..246c874de629 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -59,12 +59,52 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
*/
+static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */
struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
struct workqueue_struct *smc_close_wq; /* wq for close work */
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ void *hdr;
+
+ if (cb_ctx->pos[0])
+ goto out;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_DUMP_HS_LIMITATION);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
+ sock_net(skb->sk)->smc.limit_smc_hs))
+ goto err;
+
+ genlmsg_end(skb, hdr);
+ cb_ctx->pos[0] = 1;
+out:
+ return skb->len;
+err:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = true;
+ return 0;
+}
+
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = false;
+ return 0;
+}
+
static void smc_set_keepalive(struct sock *sk, int val)
{
struct smc_sock *smc = smc_sk(sk);
@@ -72,6 +112,51 @@ static void smc_set_keepalive(struct sock *sk, int val)
smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
+static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
+{
+ struct smc_sock *smc;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
+ sk->sk_max_ack_backlog)
+ goto drop;
+
+ if (sk_acceptq_is_full(&smc->sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ /* passthrough to original syn recv sock fct */
+ return smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
+ own_req);
+
+drop:
+ dst_release(dst);
+ tcp_listendrop(sk);
+ return NULL;
+}
+
+static bool smc_hs_congested(const struct sock *sk)
+{
+ const struct smc_sock *smc;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (!smc)
+ return true;
+
+ if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
+ return true;
+
+ return false;
+}
+
static struct smc_hashinfo smc_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
@@ -1594,6 +1679,9 @@ static void smc_listen_out(struct smc_sock *new_smc)
struct smc_sock *lsmc = new_smc->listen_smc;
struct sock *newsmcsk = &new_smc->sk;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_dec(&lsmc->queued_smc_hs);
+
if (lsmc->sk.sk_state == SMC_LISTEN) {
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
smc_accept_enqueue(&lsmc->sk, newsmcsk);
@@ -2199,6 +2287,9 @@ static void smc_tcp_listen_work(struct work_struct *work)
if (!new_smc)
continue;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_inc(&lsmc->queued_smc_hs);
+
new_smc->listen_smc = lsmc;
new_smc->use_fallback = lsmc->use_fallback;
new_smc->fallback_rsn = lsmc->fallback_rsn;
@@ -2227,7 +2318,7 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock)
lsmc->clcsk_data_ready(listen_clcsock);
if (lsmc->sk.sk_state == SMC_LISTEN) {
sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
- if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
sock_put(&lsmc->sk);
}
}
@@ -2265,6 +2356,18 @@ static int smc_listen(struct socket *sock, int backlog)
smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
smc->clcsock->sk->sk_user_data =
(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+
+ /* save original ops */
+ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
+
+ smc->af_ops = *smc->ori_af_ops;
+ smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
+
+ inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
+
+ if (smc->limit_smc_hs)
+ tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+
rc = kernel_listen(smc->clcsock, backlog);
if (rc) {
smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
@@ -2558,6 +2661,67 @@ out:
return rc ? rc : rc1;
}
+static int __smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+ int val, len;
+
+ smc = smc_sk(sock->sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ val = smc->limit_smc_hs;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int __smc_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int val, rc;
+
+ smc = smc_sk(sk);
+
+ lock_sock(sk);
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
+ return -EFAULT;
+
+ smc->limit_smc_hs = !!val;
+ rc = 0;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
static int smc_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -2567,6 +2731,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
if (level == SOL_TCP && optname == TCP_ULP)
return -EOPNOTSUPP;
+ else if (level == SOL_SMC)
+ return __smc_setsockopt(sock, level, optname, optval, optlen);
smc = smc_sk(sk);
@@ -2649,6 +2815,9 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
struct smc_sock *smc;
int rc;
+ if (level == SOL_SMC)
+ return __smc_getsockopt(sock, level, optname, optval, optlen);
+
smc = smc_sk(sock->sk);
mutex_lock(&smc->clcsock_release_lock);
if (!smc->clcsock) {
@@ -2877,6 +3046,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol,
smc->use_fallback = false; /* assume rdma capability first */
smc->fallback_rsn = 0;
+ /* default behavior from limit_smc_hs in every net namespace */
+ smc->limit_smc_hs = net->smc.limit_smc_hs;
+
rc = 0;
if (!clcsock) {
rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
@@ -3024,9 +3196,14 @@ static int __init smc_init(void)
goto out_nl;
rc = -ENOMEM;
+
+ smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
+ if (!smc_tcp_ls_wq)
+ goto out_pnet;
+
smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
if (!smc_hs_wq)
- goto out_pnet;
+ goto out_alloc_tcp_ls_wq;
smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
if (!smc_close_wq)
@@ -3097,6 +3274,8 @@ out_alloc_wqs:
destroy_workqueue(smc_close_wq);
out_alloc_hs_wq:
destroy_workqueue(smc_hs_wq);
+out_alloc_tcp_ls_wq:
+ destroy_workqueue(smc_tcp_ls_wq);
out_pnet:
smc_pnet_exit();
out_nl:
@@ -3115,6 +3294,7 @@ static void __exit smc_exit(void)
smc_core_exit();
smc_ib_unregister_client();
destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_tcp_ls_wq);
destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 37b2001a0255..a096d8af21a0 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -14,6 +14,7 @@
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/compiler.h> /* __aligned */
+#include <net/genetlink.h>
#include <net/sock.h>
#include "smc_ib.h"
@@ -249,9 +250,14 @@ struct smc_sock { /* smc sock container */
struct work_struct smc_listen_work;/* prepare new accept socket */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
+ bool limit_smc_hs; /* put constraint on handshake */
bool use_fallback; /* fallback to tcp */
int fallback_rsn; /* reason for fallback */
u32 peer_diagnosis; /* decline reason from peer */
+ atomic_t queued_smc_hs; /* queued smc handshakes */
+ struct inet_connection_sock_af_ops af_ops;
+ const struct inet_connection_sock_af_ops *ori_af_ops;
+ /* original af ops */
int sockopt_defer_accept;
/* sockopt TCP_DEFER_ACCEPT
* value
@@ -276,7 +282,7 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
return (struct smc_sock *)sk;
}
-static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk)
+static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
{
return (struct smc_sock *)
((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
@@ -331,4 +337,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr,
struct smc_gidlist *gidlist,
struct smc_ib_device *known_dev, u8 *known_gid);
+/* smc handshake limitation interface for netlink */
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+
#endif /* __SMC_H */
diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c
index f13ab0661ed5..c5a62f6f52ba 100644
--- a/net/smc/smc_netlink.c
+++ b/net/smc/smc_netlink.c
@@ -111,6 +111,21 @@ static const struct genl_ops smc_gen_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.doit = smc_nl_disable_seid,
},
+ {
+ .cmd = SMC_NETLINK_DUMP_HS_LIMITATION,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_enable_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_disable_hs_limitation,
+ },
};
static const struct nla_policy smc_gen_nl_policy[2] = {
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 0599246c0376..ff61b7b95875 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -870,6 +870,9 @@ int smc_pnet_net_init(struct net *net)
smc_pnet_create_pnetids_list(net);
+ /* disable handshake limitation by default */
+ net->smc.limit_smc_hs = 0;
+
return 0;
}