Merge branch 'bpf-cgroup-bind-connect'

Andrey Ignatov says: ==================== v2->v3: - rebase due to conflicts - fix ipv6=m build v1->v2: - support expected_attach_type at prog load time so that prog (incl. context accesses and calls to helpers) can be validated with regard to specific attach point it is supposed to be attached to. Later, at attach time, attach type is checked so that it must be same as at load time if it was provided - reworked hooks to rely on expected_attach_type, and reduced number of new prog types from 6 to just 1: BPF_PROG_TYPE_CGROUP_SOCK_ADDR - reused BPF_PROG_TYPE_CGROUP_SOCK for sys_bind post-hooks - add selftests for post-sys_bind hook For our container management we've been using complicated and fragile setup consisting of LD_PRELOAD wrapper intercepting bind and connect calls from all containerized applications. Unfortunately it doesn't work for apps that don't use glibc and changing all applications that run in the datacenter is not possible due to 3rd party code and libraries (despite being open source code) and sheer amount of legacy code that has to be rewritten (we're rewriting what we can in parallel) These applications are written without containers in mind and have builtin assumptions about network services. Like an application X expects to connect localhost:special_port and find service Y in there. To move application X and service Y into two different containers LD_PRELOAD approach is used to help one service connect to another without rewriting them. Moving these two applications into different L2 (netns) or L3 (vrf) network isolation scopes doesn't help to solve the problem, since applications need to see each other like they were running on the host without containers. So if app X and app Y would run in different netns something would need to punch a connectivity hole in those namespaces. That would be real layering violation (with corresponding network debugging pains), since clean l2, l3 abstraction would suddenly support something that breaks through the layers. Instead we used LD_PRELOAD (and now bpf programs) at bind/connect time to help applications discover and connect to each other. All applications are running in init_nens and there are no vrfs. After bind/connect the normal fib/neighbor core networking logic works as it should always do and the whole system is clean from network point of view and can be debugged with standard tools. We also considered resurrecting Hannes's afnetns work, but all hierarchical namespace abstraction don't work due to these builtin networking assumptions inside the apps. To run an application inside cgroup container that was not written with containers in mind we have to make an illusion of running in non-containerized environment. In some cases we remember the port and container id in the post-bind hook in a bpf map and when some other task in a different container is trying to connect to a service we need to know where this service is running. It can be remote and can be local. Both client and service may or may not be written with containers in mind and this sockaddr rewrite is providing connectivity and load balancing feature. BPF+cgroup looks to be the best solution for this problem. Hence we introduce 3 hooks: - at entry into sys_bind and sys_connect to let bpf prog look and modify 'struct sockaddr' provided by user space and fail bind/connect when appropriate - post sys_bind after port is allocated The approach works great and has zero overhead for anyone who doesn't use it and very low overhead when deployed. Different use case for this feature is to do low overhead firewall that doesn't need to inspect all packets and works at bind/connect time. ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
author: Daniel Borkmann <daniel@iogearbox.net> 2018-03-31 02:17:57 +0200
committer: Daniel Borkmann <daniel@iogearbox.net> 2018-03-31 02:18:07 +0200
commit: 7828f20e3779e4e85e55371e0e43f5006a15fb41 (patch)
tree: 48f4977b0b8e69bd6432b18556ad9ac7ca7728eb /tools/testing/selftests/bpf/test_sock.c
parent: 807ae7daf5fb9ba9ef688344ae7c0d8cbebd211c (diff)
parent: 1d436885b23bf4474617914d7eb15e039c83ed99 (diff)
download: linux-7828f20e3779e4e85e55371e0e43f5006a15fb41.tar.bz2
1 files changed, 479 insertions, 0 deletions
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
new file mode 100644
index 000000000000..73bb20cfb9b7
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <linux/filter.h>
+
+#include <bpf/bpf.h>
+
+#include "cgroup_helpers.h"
+
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+#define CG_PATH		"/foo"
+#define MAX_INSNS	512
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+struct sock_test {
+	const char *descr;
+	/* BPF prog properties */
+	struct bpf_insn	insns[MAX_INSNS];
+	enum bpf_attach_type expected_attach_type;
+	enum bpf_attach_type attach_type;
+	/* Socket properties */
+	int domain;
+	int type;
+	/* Endpoint to bind() to */
+	const char *ip;
+	unsigned short port;
+	/* Expected test result */
+	enum {
+		LOAD_REJECT,
+		ATTACH_REJECT,
+		BIND_REJECT,
+		SUCCESS,
+	} result;
+};
+
+static struct sock_test tests[] = {
+	{
+		"bind4 load with invalid access: src_ip6",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip6[0])),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET4_POST_BIND,
+		BPF_CGROUP_INET4_POST_BIND,
+		0,
+		0,
+		NULL,
+		0,
+		LOAD_REJECT,
+	},
+	{
+		"bind4 load with invalid access: mark",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, mark)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET4_POST_BIND,
+		BPF_CGROUP_INET4_POST_BIND,
+		0,
+		0,
+		NULL,
+		0,
+		LOAD_REJECT,
+	},
+	{
+		"bind6 load with invalid access: src_ip4",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip4)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET6_POST_BIND,
+		BPF_CGROUP_INET6_POST_BIND,
+		0,
+		0,
+		NULL,
+		0,
+		LOAD_REJECT,
+	},
+	{
+		"sock_create load with invalid access: src_port",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET_SOCK_CREATE,
+		BPF_CGROUP_INET_SOCK_CREATE,
+		0,
+		0,
+		NULL,
+		0,
+		LOAD_REJECT,
+	},
+	{
+		"sock_create load w/o expected_attach_type (compat mode)",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		0,
+		BPF_CGROUP_INET_SOCK_CREATE,
+		AF_INET,
+		SOCK_STREAM,
+		"127.0.0.1",
+		8097,
+		SUCCESS,
+	},
+	{
+		"sock_create load w/ expected_attach_type",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET_SOCK_CREATE,
+		BPF_CGROUP_INET_SOCK_CREATE,
+		AF_INET,
+		SOCK_STREAM,
+		"127.0.0.1",
+		8097,
+		SUCCESS,
+	},
+	{
+		"attach type mismatch bind4 vs bind6",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET4_POST_BIND,
+		BPF_CGROUP_INET6_POST_BIND,
+		0,
+		0,
+		NULL,
+		0,
+		ATTACH_REJECT,
+	},
+	{
+		"attach type mismatch bind6 vs bind4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET6_POST_BIND,
+		BPF_CGROUP_INET4_POST_BIND,
+		0,
+		0,
+		NULL,
+		0,
+		ATTACH_REJECT,
+	},
+	{
+		"attach type mismatch default vs bind4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		0,
+		BPF_CGROUP_INET4_POST_BIND,
+		0,
+		0,
+		NULL,
+		0,
+		ATTACH_REJECT,
+	},
+	{
+		"attach type mismatch bind6 vs sock_create",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET6_POST_BIND,
+		BPF_CGROUP_INET_SOCK_CREATE,
+		0,
+		0,
+		NULL,
+		0,
+		ATTACH_REJECT,
+	},
+	{
+		"bind4 reject all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET4_POST_BIND,
+		BPF_CGROUP_INET4_POST_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		"0.0.0.0",
+		0,
+		BIND_REJECT,
+	},
+	{
+		"bind6 reject all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET6_POST_BIND,
+		BPF_CGROUP_INET6_POST_BIND,
+		AF_INET6,
+		SOCK_STREAM,
+		"::",
+		0,
+		BIND_REJECT,
+	},
+	{
+		"bind6 deny specific IP & port",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip6[3])),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x01000000, 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET6_POST_BIND,
+		BPF_CGROUP_INET6_POST_BIND,
+		AF_INET6,
+		SOCK_STREAM,
+		"::1",
+		8193,
+		BIND_REJECT,
+	},
+	{
+		"bind4 allow specific IP & port",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip4)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x0100007F, 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+			/* return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_A(1),
+
+			/* else return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET4_POST_BIND,
+		BPF_CGROUP_INET4_POST_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		"127.0.0.1",
+		4098,
+		SUCCESS,
+	},
+	{
+		"bind4 allow all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET4_POST_BIND,
+		BPF_CGROUP_INET4_POST_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		"0.0.0.0",
+		0,
+		SUCCESS,
+	},
+	{
+		"bind6 allow all",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		BPF_CGROUP_INET6_POST_BIND,
+		BPF_CGROUP_INET6_POST_BIND,
+		AF_INET6,
+		SOCK_STREAM,
+		"::",
+		0,
+		SUCCESS,
+	},
+};
+
+static size_t probe_prog_length(const struct bpf_insn *fp)
+{
+	size_t len;
+
+	for (len = MAX_INSNS - 1; len > 0; --len)
+		if (fp[len].code != 0 || fp[len].imm != 0)
+			break;
+	return len + 1;
+}
+
+static int load_sock_prog(const struct bpf_insn *prog,
+			  enum bpf_attach_type attach_type)
+{
+	struct bpf_load_program_attr attr;
+
+	memset(&attr, 0, sizeof(struct bpf_load_program_attr));
+	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
+	attr.expected_attach_type = attach_type;
+	attr.insns = prog;
+	attr.insns_cnt = probe_prog_length(attr.insns);
+	attr.license = "GPL";
+
+	return bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+}
+
+static int attach_sock_prog(int cgfd, int progfd,
+			    enum bpf_attach_type attach_type)
+{
+	return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE);
+}
+
+static int bind_sock(int domain, int type, const char *ip, unsigned short port)
+{
+	struct sockaddr_storage addr;
+	struct sockaddr_in6 *addr6;
+	struct sockaddr_in *addr4;
+	int sockfd = -1;
+	socklen_t len;
+	int err = 0;
+
+	sockfd = socket(domain, type, 0);
+	if (sockfd < 0)
+		goto err;
+
+	memset(&addr, 0, sizeof(addr));
+
+	if (domain == AF_INET) {
+		len = sizeof(struct sockaddr_in);
+		addr4 = (struct sockaddr_in *)&addr;
+		addr4->sin_family = domain;
+		addr4->sin_port = htons(port);
+		if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1)
+			goto err;
+	} else if (domain == AF_INET6) {
+		len = sizeof(struct sockaddr_in6);
+		addr6 = (struct sockaddr_in6 *)&addr;
+		addr6->sin6_family = domain;
+		addr6->sin6_port = htons(port);
+		if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1)
+			goto err;
+	} else {
+		goto err;
+	}
+
+	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1)
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(sockfd);
+	return err;
+}
+
+static int run_test_case(int cgfd, const struct sock_test *test)
+{
+	int progfd = -1;
+	int err = 0;
+
+	printf("Test case: %s .. ", test->descr);
+	progfd = load_sock_prog(test->insns, test->expected_attach_type);
+	if (progfd < 0) {
+		if (test->result == LOAD_REJECT)
+			goto out;
+		else
+			goto err;
+	}
+
+	if (attach_sock_prog(cgfd, progfd, test->attach_type) == -1) {
+		if (test->result == ATTACH_REJECT)
+			goto out;
+		else
+			goto err;
+	}
+
+	if (bind_sock(test->domain, test->type, test->ip, test->port) == -1) {
+		/* sys_bind() may fail for different reasons, errno has to be
+		 * checked to confirm that BPF program rejected it.
+		 */
+		if (test->result == BIND_REJECT && errno == EPERM)
+			goto out;
+		else
+			goto err;
+	}
+
+
+	if (test->result != SUCCESS)
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	/* Detaching w/o checking return code: best effort attempt. */
+	if (progfd != -1)
+		bpf_prog_detach(cgfd, test->attach_type);
+	close(progfd);
+	printf("[%s]\n", err ? "FAIL" : "PASS");
+	return err;
+}
+
+static int run_tests(int cgfd)
+{
+	int passes = 0;
+	int fails = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+		if (run_test_case(cgfd, &tests[i]))
+			++fails;
+		else
+			++passes;
+	}
+	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
+	return fails ? -1 : 0;
+}
+
+int main(int argc, char **argv)
+{
+	int cgfd = -1;
+	int err = 0;
+
+	if (setup_cgroup_environment())
+		goto err;
+
+	cgfd = create_and_get_cgroup(CG_PATH);
+	if (!cgfd)
+		goto err;
+
+	if (join_cgroup(CG_PATH))
+		goto err;
+
+	if (run_tests(cgfd))
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(cgfd);
+	cleanup_cgroup_environment();
+	return err;
+}
author	Daniel Borkmann <daniel@iogearbox.net>	2018-03-31 02:17:57 +0200
committer	Daniel Borkmann <daniel@iogearbox.net>	2018-03-31 02:18:07 +0200
commit	7828f20e3779e4e85e55371e0e43f5006a15fb41 (patch)
tree	48f4977b0b8e69bd6432b18556ad9ac7ca7728eb /tools/testing/selftests/bpf/test_sock.c
parent	807ae7daf5fb9ba9ef688344ae7c0d8cbebd211c (diff)
parent	1d436885b23bf4474617914d7eb15e039c83ed99 (diff)
download	linux-7828f20e3779e4e85e55371e0e43f5006a15fb41.tar.bz2