summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--kernel/bpf/bpf_struct_ops.c22
-rw-r--r--net/core/filter.c6
-rw-r--r--net/ipv4/bpf_tcp_ca.c41
3 files changed, 65 insertions, 4 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 70f6fd4fa305..d6731c32864e 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -28,6 +28,7 @@ struct bpf_struct_ops_value {
struct bpf_struct_ops_map {
struct bpf_map map;
+ struct rcu_head rcu;
const struct bpf_struct_ops *st_ops;
/* protect map_update */
struct mutex lock;
@@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata)
return refcount_inc_not_zero(&kvalue->refcnt);
}
+static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+{
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ bpf_map_put(&st_map->map);
+}
+
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
@@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata)
st_map = container_of(kvalue, struct bpf_struct_ops_map,
kvalue);
- bpf_map_put(&st_map->map);
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its map->refcnt may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * Thus, a rcu grace period is needed here.
+ */
+ call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
}
}
diff --git a/net/core/filter.c b/net/core/filter.c
index cfbd01167eb5..2e32cee2c469 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5051,6 +5051,12 @@ err_clear:
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
int, optname, char *, optval, int, optlen)
{
+ if (level == SOL_TCP && optname == TCP_CONGESTION) {
+ if (optlen >= sizeof("cdg") - 1 &&
+ !strncmp("cdg", optval, optlen))
+ return -ENOTSUPP;
+ }
+
return _bpf_setsockopt(sk, level, optname, optval, optlen);
}
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 9e41eff4a685..0dcee9df1326 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -10,6 +10,9 @@
#include <net/tcp.h>
#include <net/bpf_sk_storage.h>
+/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */
+extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+
static u32 optional_ops[] = {
offsetof(struct tcp_congestion_ops, init),
offsetof(struct tcp_congestion_ops, release),
@@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
.arg2_type = ARG_ANYTHING,
};
+static u32 prog_ops_moff(const struct bpf_prog *prog)
+{
+ const struct btf_member *m;
+ const struct btf_type *t;
+ u32 midx;
+
+ midx = prog->expected_attach_type;
+ t = bpf_tcp_congestion_ops.type;
+ m = &btf_type_member(t)[midx];
+
+ return btf_member_bit_offset(t, m) / 8;
+}
+
static const struct bpf_func_proto *
bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
const struct bpf_prog *prog)
@@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ /* Does not allow release() to call setsockopt.
+ * release() is called when the current bpf-tcp-cc
+ * is retiring. It is not allowed to call
+ * setsockopt() to make further changes which
+ * may potentially allocate new resources.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ /* Since get/setsockopt is usually expected to
+ * be available together, disable getsockopt for
+ * release also to avoid usage surprise.
+ * The bpf-tcp-cc already has a more powerful way
+ * to read tcp_sock from the PTR_TO_BTF_ID.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
default:
return bpf_base_func_proto(func_id);
}
@@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata)
tcp_unregister_congestion_control(kdata);
}
-/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */
-extern struct bpf_struct_ops bpf_tcp_congestion_ops;
-
struct bpf_struct_ops bpf_tcp_congestion_ops = {
.verifier_ops = &bpf_tcp_ca_verifier_ops,
.reg = bpf_tcp_ca_reg,