diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-04 15:12:02 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-04 15:12:02 -0700 |
commit | 4f30a60aa78410496e5ffe632a371c00f0d83a8d (patch) | |
tree | b2f74fcd6b286c961f310548de9b2cc855787849 /tools | |
parent | 74858abbb1032222f922487fd1a24513bbed80f9 (diff) | |
parent | a5161eeef97cb0cdc4de966005926db2f5894af4 (diff) | |
download | linux-4f30a60aa78410496e5ffe632a371c00f0d83a8d.tar.bz2 |
Merge tag 'close-range-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull close_range() implementation from Christian Brauner:
"This adds the close_range() syscall. It allows a task to efficiently
close a range of file descriptors, up to and including all of the
calling task's file descriptors.
This is coordinated with the FreeBSD folks, who have copied our
version of this syscall and have in the meantime already merged it in
April 2019:
https://reviews.freebsd.org/D21627
https://svnweb.freebsd.org/base?view=revision&revision=359836
The syscall originally came up in a discussion around the new mount
API and making new file descriptor types cloexec by default. During
this discussion, Al suggested the close_range() syscall.
First, it helps to close all file descriptors of an exec()ing task.
This can be done safely via (quoting Al's example from [1] verbatim):
/* that exec is sensitive */
unshare(CLONE_FILES);
/* we don't want anything past stderr here */
close_range(3, ~0U);
execve(....);
The code snippet above is one way of working around the problem that
file descriptors are not cloexec by default. This is aggravated by the
fact that we can't just switch them over without massively regressing
userspace. For a whole class of programs, having an in-kernel method of
closing all file descriptors is very helpful (e.g. daemons, service
managers, programming language standard libraries, container managers,
etc.).
Second, it spares userspace from having to close all file descriptors
by parsing /proc/<pid>/fd/* and calling close() on each file
descriptor, or by resorting to other hacks. From looking at various
large(ish) userspace code bases this or similar patterns are very
common in service managers, container runtimes, and programming
language runtimes/standard libraries such as Python or Rust.
In addition, the syscall will also work for tasks that do not have
procfs mounted and on kernels that do not have procfs support compiled
in. In such situations, the only way to make sure that all file
descriptors are closed is to call close() on each file descriptor up
to UINT_MAX, or to resort to RLIMIT_NOFILE/OPEN_MAX trickery.
Based on Linus' suggestion, close_range() also comes with a new flag,
CLOSE_RANGE_UNSHARE, to more elegantly handle file descriptor dropping
right before exec. This would usually be expressed in the sequence:
unshare(CLONE_FILES);
close_range(3, ~0U);
As pointed out by Linus, it might be desirable to have this be a part
of close_range() itself, under a new flag CLOSE_RANGE_UNSHARE, which
is especially handy when we're closing all file descriptors above a
certain threshold.
Test-suite as always included"
* tag 'close-range-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
tests: add CLOSE_RANGE_UNSHARE tests
close_range: add CLOSE_RANGE_UNSHARE
tests: add close_range() tests
arch: wire-up close_range()
open: add close_range()
Diffstat (limited to 'tools')
-rw-r--r-- | tools/testing/selftests/Makefile | 1 | ||||
-rw-r--r-- | tools/testing/selftests/core/.gitignore | 1 | ||||
-rw-r--r-- | tools/testing/selftests/core/Makefile | 7 | ||||
-rw-r--r-- | tools/testing/selftests/core/close_range_test.c | 227 |
4 files changed, 236 insertions, 0 deletions
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 227ca78a5b7f..017ce2a7ae36 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -6,6 +6,7 @@ TARGETS += breakpoints
 TARGETS += capabilities
 TARGETS += cgroup
 TARGETS += clone3
+TARGETS += core
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
diff --git a/tools/testing/selftests/core/.gitignore b/tools/testing/selftests/core/.gitignore
new file mode 100644
index 000000000000..6e6712ce5817
--- /dev/null
+++ b/tools/testing/selftests/core/.gitignore
@@ -0,0 +1 @@
+close_range_test
diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile
new file mode 100644
index 000000000000..f6f2d6f473c6
--- /dev/null
+++ b/tools/testing/selftests/core/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -g -I../../../../usr/include/
+
+TEST_GEN_PROGS := close_range_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
new file mode 100644
index 000000000000..c99b98b0d461
--- /dev/null
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kernel.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+#include "../clone3/clone3_selftests.h"
+
+#ifndef __NR_close_range
+#define __NR_close_range -1
+#endif
+
+#ifndef CLOSE_RANGE_UNSHARE
+#define CLOSE_RANGE_UNSHARE (1U << 1)
+#endif
+
+static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
+				  unsigned int flags)
+{
+	return syscall(__NR_close_range, fd, max_fd, flags);
+}
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+TEST(close_range)
+{
+	int i, ret;
+	int open_fds[101];
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				XFAIL(return, "Skipping test since /dev/null does not exist");
+		}
+
+		open_fds[i] = fd;
+	}
+
+	EXPECT_EQ(-1, sys_close_range(open_fds[0], open_fds[100], -1)) {
+		if (errno == ENOSYS)
+			XFAIL(return, "close_range() syscall not supported");
+	}
+
+	EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], 0));
+
+	for (i = 0; i <= 50; i++)
+		EXPECT_EQ(-1, fcntl(open_fds[i], F_GETFL));
+
+	for (i = 51; i <= 100; i++)
+		EXPECT_GT(fcntl(open_fds[i], F_GETFL), -1);
+
+	/* create a couple of gaps */
+	close(57);
+	close(78);
+	close(81);
+	close(82);
+	close(84);
+	close(90);
+
+	EXPECT_EQ(0, sys_close_range(open_fds[51], open_fds[92], 0));
+
+	for (i = 51; i <= 92; i++)
+		EXPECT_EQ(-1, fcntl(open_fds[i], F_GETFL));
+
+	for (i = 93; i <= 100; i++)
+		EXPECT_GT(fcntl(open_fds[i], F_GETFL), -1);
+
+	/* test that the kernel caps and still closes all fds */
+	EXPECT_EQ(0, sys_close_range(open_fds[93], open_fds[99], 0));
+
+	for (i = 93; i <= 99; i++)
+		EXPECT_EQ(-1, fcntl(open_fds[i], F_GETFL));
+
+	EXPECT_GT(fcntl(open_fds[i], F_GETFL), -1);
+
+	EXPECT_EQ(0, sys_close_range(open_fds[100], open_fds[100], 0));
+
+	EXPECT_EQ(-1, fcntl(open_fds[100], F_GETFL));
+}
+
+TEST(close_range_unshare)
+{
+	int i, ret, status;
+	pid_t pid;
+	int open_fds[101];
+	struct clone_args args = {
+		.flags = CLONE_FILES,
+		.exit_signal = SIGCHLD,
+	};
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				XFAIL(return, "Skipping test since /dev/null does not exist");
+		}
+
+		open_fds[i] = fd;
+	}
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = sys_close_range(open_fds[0], open_fds[50],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 0; i <= 50; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		for (i = 51; i <= 100; i++)
+			if (fcntl(open_fds[i], F_GETFL) == -1)
+				exit(EXIT_FAILURE);
+
+		/* create a couple of gaps */
+		close(57);
+		close(78);
+		close(81);
+		close(82);
+		close(84);
+		close(90);
+
+		ret = sys_close_range(open_fds[51], open_fds[92],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 51; i <= 92; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		for (i = 93; i <= 100; i++)
+			if (fcntl(open_fds[i], F_GETFL) == -1)
+				exit(EXIT_FAILURE);
+
+		/* test that the kernel caps and still closes all fds */
+		ret = sys_close_range(open_fds[93], open_fds[99],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 93; i <= 99; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		if (fcntl(open_fds[100], F_GETFL) == -1)
+			exit(EXIT_FAILURE);
+
+		ret = sys_close_range(open_fds[100], open_fds[100],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		if (fcntl(open_fds[100], F_GETFL) != -1)
+			exit(EXIT_FAILURE);
+
+		exit(EXIT_SUCCESS);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(close_range_unshare_capped)
+{
+	int i, ret, status;
+	pid_t pid;
+	int open_fds[101];
+	struct clone_args args = {
+		.flags = CLONE_FILES,
+		.exit_signal = SIGCHLD,
+	};
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				XFAIL(return, "Skipping test since /dev/null does not exist");
+		}
+
+		open_fds[i] = fd;
+	}
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = sys_close_range(open_fds[0], UINT_MAX,
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 0; i <= 100; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		exit(EXIT_SUCCESS);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST_HARNESS_MAIN