diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-04 14:40:07 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-04 14:40:07 -0700 |
commit | 0a72761b27fe3b10e3f336bf2f2aa22635504cdd (patch) | |
tree | fbd99b0eac76464388070ef29153b9c8d596dc5f /tools | |
parent | 3950e975431bc914f7e81b8f2a2dbdf2064acb0f (diff) | |
parent | 55d9ad97e417cc2604654913e902d26f942bde00 (diff) | |
download | linux-0a72761b27fe3b10e3f336bf2f2aa22635504cdd.tar.bz2 |
Merge tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull thread updates from Christian Brauner:
"This contains the changes to add the missing support for attaching to
time namespaces via pidfds.
Last cycle setns() was changed to support attaching to multiple
namespaces atomically. This requires all namespaces to have a point of
no return where they can't fail anymore.
Specifically, <namespace-type>_install() is allowed to perform
permission checks and install the namespace into the new struct nsset
that it has been given but it is not allowed to make visible changes
to the affected task. Once <namespace-type>_install() returns,
anything that the given namespace type additionally requires to be
set up should ideally be done in a function that can't fail or, if it
does fail, the failure must be non-fatal.
For time namespaces the relevant functions that fell into this
category were timens_set_vvar_page() and vdso_join_timens(). The
latter could still fail although it didn't need to. This function is
only implemented for x86 in current mainline. As
discussed on-list (cf. [1]), in order to make setns() properly support
time namespaces when attaching to multiple namespaces at once, we
changed vdso_join_timens() to always succeed: vdso_join_timens() now
takes mmap_read_lock() instead of mmap_write_lock_killable().
Please note that arm64 is about to grow vdso support for time namespaces
(possibly this merge window). We've synced on this change and arm64
also uses mmap_read_lock(), i.e. makes vdso_join_timens() a function
that can't fail. Once the changes here and the arm64 changes have
landed, vdso_join_timens() should be turned into a void function so
it's obvious to callers and implementers on other architectures that
the expectation is that it can't fail.
We didn't do this right away because it would've introduced
unnecessary merge conflicts between the two trees for no major gain.
As always, tests are included."
[1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein
* tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
tests: add CLONE_NEWTIME setns tests
nsproxy: support CLONE_NEWTIME with setns()
timens: add timens_commit() helper
timens: make vdso_join_timens() always succeed
Diffstat (limited to 'tools')
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd.h | 4 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd_setns_test.c | 76 |
2 files changed, 80 insertions, 0 deletions
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 8d728eda783d..a2c80914e3dc 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -22,6 +22,10 @@ #define P_PIDFD 3 #endif +#ifndef CLONE_NEWTIME +#define CLONE_NEWTIME 0x00000080 +#endif + #ifndef CLONE_PIDFD #define CLONE_PIDFD 0x00001000 #endif diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 9418108eae13..7a27dbe07e23 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -32,6 +32,7 @@ enum { PIDFD_NS_NET, PIDFD_NS_CGROUP, PIDFD_NS_PIDCLD, + PIDFD_NS_TIME, PIDFD_NS_MAX }; @@ -47,6 +48,7 @@ const struct ns_info { [PIDFD_NS_NET] = { "net", CLONE_NEWNET, }, [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, }, [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, }, + [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, }, }; FIXTURE(current_nsset) @@ -83,9 +85,49 @@ pid_t create_child(int *pidfd, unsigned flags) return sys_clone3(&args, sizeof(struct clone_args)); } +static bool switch_timens(void) +{ + int fd, ret; + + if (unshare(CLONE_NEWTIME)) + return false; + + fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC); + if (fd < 0) + return false; + + ret = setns(fd, CLONE_NEWTIME); + close(fd); + return ret == 0; +} + +static ssize_t read_nointr(int fd, void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = read(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +static ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = write(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + FIXTURE_SETUP(current_nsset) { int i, proc_fd, ret; + int ipc_sockets[2]; + char c; for (i = 0; i < PIDFD_NS_MAX; i++) { self->nsfds[i] = -EBADF; @@ -130,6 +172,9 @@ FIXTURE_SETUP(current_nsset) TH_LOG("%m - Failed to open 
pidfd for process %d", self->pid); } + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + /* Create tasks that will be stopped. */ self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER | CLONE_NEWNS | @@ -139,10 +184,27 @@ FIXTURE_SETUP(current_nsset) EXPECT_GE(self->child_pid1, 0); if (self->child_pid1 == 0) { + close(ipc_sockets[0]); + + if (!switch_timens()) + _exit(EXIT_FAILURE); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + pause(); _exit(EXIT_SUCCESS); } + close(ipc_sockets[1]); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + close(ipc_sockets[0]); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWCGROUP | CLONE_NEWIPC | @@ -151,10 +213,24 @@ FIXTURE_SETUP(current_nsset) EXPECT_GE(self->child_pid2, 0); if (self->child_pid2 == 0) { + close(ipc_sockets[0]); + + if (!switch_timens()) + _exit(EXIT_FAILURE); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + pause(); _exit(EXIT_SUCCESS); } + close(ipc_sockets[1]); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + close(ipc_sockets[0]); + for (i = 0; i < PIDFD_NS_MAX; i++) { char p[100]; |