author     David S. Miller <davem@davemloft.net>  2022-01-31 15:05:25 +0000
committer  David S. Miller <davem@davemloft.net>  2022-01-31 15:05:25 +0000
commit     01b2a995156d11166da00ce254d59bd7f7cefb92 (patch)
tree       7211b0a22e1a7011c1210fbfad9a0695f5a9bd93 /net
parent     678dfd5280341d877ca646499bfdc82a3d8b4356 (diff)
parent     cb6cd2cec799356e5e2f75a8591894599a6ad49d (diff)
Merge branch 'hash-rethink'
Akhmat Karakotov says:

====================
Make hash rethink configurable

As shown in the report by Alexander Azimov, hash rethink at the
client side may lead to connection timeouts toward stateful anycast
services. Tom Herbert created a patchset to address this issue by
applying hash rethink only after a negative routing event (3 RTOs) [1].
That change also affects server-side behavior, which we found
undesirable.

This patchset changes the defaults to make them safe: hash rethink is
disabled at the client side and enabled at the server side upon each
RTO event or in case of duplicate acknowledgments.

The patchset provides two options to change the default behavior. Hash
rethink may be disabled at the server side by the new sysctl option;
changes to the sysctl don't affect the default behavior at the client
side. Hash rethink can also be enabled/disabled with a socket option or
bpf syscalls, which overwrite both the default and the sysctl setting.
This socket option is available on both the client and the server side.
This provides the mechanics to enable hash rethink inside an
administrative domain, such as a DC, where client-side hash rethink can
be desirable.

[1] https://lore.kernel.org/netdev/20210809185314.38187-1-tom@herbertland.com/

v2: - Changed sysctl default to ENABLED in all patches. Reduced sysctl
      and socket option size to u8. Fixed netns bug reported by the
      kernel test robot.

v3: - Fixed bug with bad u8 comparison. Moved sk_txrehash to use fewer
      bytes in the struct. Added WRITE_ONCE() in setsockopt() and
      READ_ONCE() in tcp_rtx_synack().

v4: - Rebase and add documentation for the sysctl option.

v5: - Move sk_txrehash out of the busy poll ifdef.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
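For reference, this is roughly how the new socket option is exercised from
userspace: a minimal sketch, not part of this series. SO_TXREHASH may be
absent from older uapi headers, so the fallback define below (74, taken from
include/uapi/asm-generic/socket.h around this merge) is an assumption to
check against your own headers.

/*
 * Minimal sketch, not part of this series: toggle SO_TXREHASH on a
 * client socket. The fallback define is an assumption; older uapi
 * headers may lack SO_TXREHASH entirely.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_TXREHASH
#define SO_TXREHASH 74
#endif

int main(void)
{
	int val = 1;	/* 1 = enabled, 0 = disabled, -1 = kernel default */
	socklen_t len = sizeof(val);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	/* Overrides both the built-in default and net.core.txrehash. */
	if (setsockopt(fd, SOL_SOCKET, SO_TXREHASH, &val, sizeof(val)))
		perror("setsockopt(SO_TXREHASH)");
	if (!getsockopt(fd, SOL_SOCKET, SO_TXREHASH, &val, &len))
		printf("SO_TXREHASH = %d\n", val);
	close(fd);
	return 0;
}

Passing -1 reverts the socket to the default policy: a listener then picks up
net.core.txrehash at listen() time, while a client socket keeps rehash
disabled, matching the defaults described above.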
Diffstat (limited to 'net')
-rw-r--r--  net/core/filter.c                | 10
-rw-r--r--  net/core/net_namespace.c         |  2
-rw-r--r--  net/core/sock.c                  | 14
-rw-r--r--  net/core/sysctl_net_core.c       | 14
-rw-r--r--  net/ipv4/inet_connection_sock.c  |  3
-rw-r--r--  net/ipv4/tcp_output.c            |  4
6 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index a06931c27eeb..9615ae1ab530 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5091,6 +5091,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
sk->sk_reuseport = valbool;
break;
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_txrehash = (u8)val;
+ break;
default:
ret = -EINVAL;
}
@@ -5269,6 +5276,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
*((int *)optval) = sk->sk_reuseport;
break;
+ case SO_TXREHASH:
+ *((int *)optval) = sk->sk_txrehash;
+ break;
default:
goto err_clear;
}
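These hunks make SO_TXREHASH reachable through bpf_setsockopt() and
bpf_getsockopt() as well, which is the "bpf syscalls" path mentioned in the
cover letter. A minimal sock_ops sketch under that assumption; the SOL_SOCKET
and SO_TXREHASH fallback defines below mirror the common uapi values and are
not part of this merge.

// SPDX-License-Identifier: GPL-2.0
/*
 * Minimal sock_ops sketch, not from this series: disable tx hash rethink
 * on every TCP socket this program sees.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET	1	/* assumed fallback */
#endif
#ifndef SO_TXREHASH
#define SO_TXREHASH	74	/* assumed fallback */
#endif

SEC("sockops")
int clear_txrehash(struct bpf_sock_ops *skops)
{
	int val = 0;	/* 0 = disabled, 1 = enabled */

	if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB ||
	    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_SOCKET, SO_TXREHASH,
			       &val, sizeof(val));
	return 1;
}

char _license[] SEC("license") = "GPL";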
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c53d9aab38ab..8711350085d6 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -364,6 +364,8 @@ out_undo:
static int __net_init net_defaults_init_net(struct net *net)
{
net->core.sysctl_somaxconn = SOMAXCONN;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+
return 0;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index cccf21f3618d..d6804685f17f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1447,6 +1447,15 @@ set_sndbuf:
break;
}
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Paired with READ_ONCE() in tcp_rtx_synack() */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_reserved_mem;
break;
+ case SO_TXREHASH:
+ v.val = sk->sk_txrehash;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -3279,6 +3292,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
+ sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7b4d485aac7a..dbeb8ecbcd98 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = {
.extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax
},
+ {
+ .procname = "txrehash",
+ .data = &init_net.core.sysctl_txrehash,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ .proc_handler = proc_dou8vec_minmax,
+ },
{ }
};
@@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
static __net_init int sysctl_core_net_init(struct net *net)
{
- struct ctl_table *tbl;
+ struct ctl_table *tbl, *tmp;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
@@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net)
if (tbl == NULL)
goto err_dup;
- tbl[0].data = &net->core.sysctl_somaxconn;
+ for (tmp = tbl; tmp->procname; tmp++)
+ tmp->data += (char *)net - (char *)&init_net;
/* Don't export any sysctls to unprivileged users */
if (net->user_ns != &init_user_ns) {
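With the table entry above, the knob appears as net.core.txrehash
(0 = disabled, 1 = enabled, default 1) and is per network namespace; the loop
added to sysctl_core_net_init() rebases every entry's data pointer from
&init_net to the child namespace, so new members such as sysctl_txrehash need
no per-entry fixup. A minimal sketch of toggling the knob from C, equivalent
to "sysctl -w net.core.txrehash=0" and assuming the usual /proc mount:

/*
 * Minimal sketch, not from this series: disable tx hash rethink for
 * listeners in the current network namespace via procfs.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/net/core/txrehash", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "0", 1) != 1)	/* 0 = disabled, 1 = enabled */
		perror("write");
	close(fd);
	return 0;
}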
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fc2a985f6064..b81fb13fc5f4 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1046,6 +1046,9 @@ int inet_csk_listen_start(struct sock *sk)
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
+ if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
+ sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
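Since the sysctl is only consulted here while the socket is still at
SOCK_TXREHASH_DEFAULT, a server that wants behavior independent of
net.core.txrehash has to set the option before calling listen(). A hedged
server-side sketch of that ordering (error handling trimmed, SO_TXREHASH
fallback as in the earlier examples):

/*
 * Minimal server-side sketch, not from this series: pin tx rehash off
 * for one listener regardless of net.core.txrehash.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_TXREHASH
#define SO_TXREHASH 74	/* assumed fallback */
#endif

int make_listener(unsigned short port)
{
	struct sockaddr_in addr;
	int off = 0;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	/* An explicit per-socket value wins over the netns sysctl. */
	setsockopt(fd, SOL_SOCKET, SO_TXREHASH, &off, sizeof(off));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}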
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 11c06b9db801..e76bf1e9251e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4092,7 +4092,9 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
struct flowi fl;
int res;
- tcp_rsk(req)->txhash = net_tx_rndhash();
+ /* Paired with WRITE_ONCE() in sock_setsockopt() */
+ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+ tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
NULL);
if (!res) {