From 439be32656035d3239fd56f9b83353ec06cb3b45 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:23 -0600 Subject: proc/fd: In proc_fd_link use fget_task When discussing[1] exec and posix file locks it was realized that none of the callers of get_files_struct fundamentally needed to call get_files_struct, and that by switching them to helper functions instead it will both simplify their code and remove unnecessary increments of files_struct.count. Those unnecessary increments can result in exec unnecessarily unsharing files_struct which breaks posix locks, and it can result in fget_light having to fall back to fget, reducing system performance. Simplifying proc_fd_link is a little bit tricky. It is necessary to know that there is a reference to fd_file while path_get is running. This reference can be guaranteed to exist either by locking the fdtable as the code currently does or by taking a reference on the file in question. Use fget_task to remove the need for get_files_struct and to take a reference to the file in question. [1] https://lkml.kernel.org/r/20180915160423.GA31461@redhat.com Suggested-by: Oleg Nesterov v1: https://lkml.kernel.org/r/20200817220425.9389-8-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-6-ebiederm@xmission.com Signed-off-by: Eric W.
Biederman --- fs/proc/fd.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 81882a13212d..d58960f6ee52 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -146,29 +146,22 @@ static const struct dentry_operations tid_fd_dentry_operations = { static int proc_fd_link(struct dentry *dentry, struct path *path) { - struct files_struct *files = NULL; struct task_struct *task; int ret = -ENOENT; task = get_proc_task(d_inode(dentry)); if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - - if (files) { unsigned int fd = proc_fd(d_inode(dentry)); struct file *fd_file; - spin_lock(&files->file_lock); - fd_file = fcheck_files(files, fd); + fd_file = fget_task(task, fd); if (fd_file) { *path = fd_file->f_path; path_get(&fd_file->f_path); ret = 0; + fput(fd_file); } - spin_unlock(&files->file_lock); - put_files_struct(files); + put_task_struct(task); } return ret; -- cgit v1.2.3 From 120ce2b0cd52abe73e8b16c23461eb14df5a87d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:25 -0600 Subject: file: Factor files_lookup_fd_locked out of fcheck_files To make it easy to tell where files->file_lock protection is being used when looking up a file create files_lookup_fd_locked. Only allow this function to be called with the file_lock held. Update the callers of fcheck and fcheck_files that are called with the files->file_lock held to call files_lookup_fd_locked instead. Hopefully this makes it easier to quickly understand what is going on. The need for better names became apparent in the last round of discussion of this set of changes[1]. [1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com Link: https://lkml.kernel.org/r/20201120231441.29911-8-ebiederm@xmission.com Signed-off-by: Eric W. 
Biederman --- fs/file.c | 2 +- fs/locks.c | 14 ++++++++------ fs/proc/fd.c | 2 +- include/linux/fdtable.h | 7 +++++++ 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/file.c b/fs/file.c index b5591efb87f5..9d0e91168be1 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1098,7 +1098,7 @@ static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) spin_lock(&files->file_lock); err = expand_files(files, newfd); - file = fcheck(oldfd); + file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { diff --git a/fs/locks.c b/fs/locks.c index 1f84a03601fe..148197c1b547 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2539,14 +2539,15 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, */ if (!error && file_lock->fl_type != F_UNLCK && !(file_lock->fl_flags & FL_OFDLCK)) { + struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do. */ - spin_lock(&current->files->file_lock); - f = fcheck(fd); - spin_unlock(&current->files->file_lock); + spin_lock(&files->file_lock); + f = files_lookup_fd_locked(files, fd); + spin_unlock(&files->file_lock); if (f != filp) { file_lock->fl_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); @@ -2670,14 +2671,15 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, */ if (!error && file_lock->fl_type != F_UNLCK && !(file_lock->fl_flags & FL_OFDLCK)) { + struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do.
*/ - spin_lock(&current->files->file_lock); - f = fcheck(fd); - spin_unlock(&current->files->file_lock); + spin_lock(&files->file_lock); + f = files_lookup_fd_locked(files, fd); + spin_unlock(&files->file_lock); if (f != filp) { file_lock->fl_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index d58960f6ee52..2cca9bca3b3a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -35,7 +35,7 @@ static int seq_show(struct seq_file *m, void *v) unsigned int fd = proc_fd(m->private); spin_lock(&files->file_lock); - file = fcheck_files(files, fd); + file = files_lookup_fd_locked(files, fd); if (file) { struct fdtable *fdt = files_fdtable(files); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 639933f37da9..fda4b81dd735 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -91,6 +91,13 @@ static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsig return NULL; } +static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) +{ + RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock), + "suspicious rcu_dereference_check() usage"); + return files_lookup_fd_raw(files, fd); +} + static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) { RCU_LOCKDEP_WARN(!rcu_read_lock_held() && -- cgit v1.2.3 From f36c2943274199cb8aef32ac96531ffb7c4b43d0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:26 -0600 Subject: file: Replace fcheck_files with files_lookup_fd_rcu This change renames fcheck_files to files_lookup_fd_rcu. All of the remaining callers take the rcu_read_lock before calling this function so the _rcu suffix is appropriate. This change also tightens up the debug check to verify that all callers hold the rcu_read_lock. All callers that used to call fcheck_files with the files->file_lock held have now been changed to call files_lookup_fd_locked.
This change of name has helped remind me of which locks and which guarantees are in place helping me to catch bugs later in the patchset. The need for better names became apparent in the last round of discussion of this set of changes[1]. [1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com Link: https://lkml.kernel.org/r/20201120231441.29911-9-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- Documentation/filesystems/files.rst | 6 +++--- fs/file.c | 4 ++-- fs/proc/fd.c | 4 ++-- include/linux/fdtable.h | 7 +++---- kernel/bpf/task_iter.c | 2 +- kernel/kcmp.c | 2 +- 6 files changed, 12 insertions(+), 13 deletions(-) (limited to 'fs/proc') diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index cbf8e57376bf..ea75acdb632c 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -62,7 +62,7 @@ the fdtable structure - be held. 4. To look up the file structure given an fd, a reader - must use either fcheck() or fcheck_files() APIs. These + must use either fcheck() or files_lookup_fd_rcu() APIs. These take care of barrier requirements due to lock-free lookup. An example:: @@ -84,7 +84,7 @@ the fdtable structure - on ->f_count:: rcu_read_lock(); - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) { if (atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; @@ -104,7 +104,7 @@ the fdtable structure - lock-free, they must be installed using rcu_assign_pointer() API. If they are looked up lock-free, rcu_dereference() must be used. However it is advisable to use files_fdtable() - and fcheck()/fcheck_files() which take care of these issues. + and fcheck()/files_lookup_fd_rcu() which take care of these issues. 7. While updating, the fdtable pointer must be looked up while holding files->file_lock. 
If ->file_lock is dropped, then diff --git a/fs/file.c b/fs/file.c index 9d0e91168be1..5861c4f89419 100644 --- a/fs/file.c +++ b/fs/file.c @@ -814,7 +814,7 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd, rcu_read_lock(); loop: - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) { /* File object ref couldn't be taken. * dup2() atomicity guarantee is the reason @@ -1127,7 +1127,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) int retval = oldfd; rcu_read_lock(); - if (!fcheck_files(files, oldfd)) + if (!files_lookup_fd_rcu(files, oldfd)) retval = -EBADF; rcu_read_unlock(); return retval; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 2cca9bca3b3a..3dec44d7c5c5 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -90,7 +90,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) return false; rcu_read_lock(); - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) *mode = file->f_mode; rcu_read_unlock(); @@ -243,7 +243,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, char name[10 + 1]; unsigned int len; - f = fcheck_files(files, fd); + f = files_lookup_fd_rcu(files, fd); if (!f) continue; data.mode = f->f_mode; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index fda4b81dd735..fa8c402a7790 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -98,10 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un return files_lookup_fd_raw(files, fd); } -static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) +static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && - !lockdep_is_held(&files->file_lock), + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious rcu_dereference_check() usage"); return files_lookup_fd_raw(files, fd); } @@ -109,7 
+108,7 @@ static inline struct file *fcheck_files(struct files_struct *files, unsigned int /* * Check whether the specified fd has an open file. */ -#define fcheck(fd) fcheck_files(current->files, fd) +#define fcheck(fd) files_lookup_fd_rcu(current->files, fd) struct task_struct; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5b6af30bfbcd..5ab2ccfb96cb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -183,7 +183,7 @@ again: for (; curr_fd < max_fds; curr_fd++) { struct file *f; - f = fcheck_files(curr_files, curr_fd); + f = files_lookup_fd_rcu(curr_files, curr_fd); if (!f) continue; if (!get_file_rcu(f)) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 87c48c0104ad..990717c1aed3 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -67,7 +67,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) rcu_read_lock(); if (task->files) - file = fcheck_files(task->files, idx); + file = files_lookup_fd_rcu(task->files, idx); rcu_read_unlock(); task_unlock(task); -- cgit v1.2.3 From 64eb661fda0269276b4c46965832938e3f268268 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:29 -0600 Subject: proc/fd: In tid_fd_mode use task_lookup_fd_rcu When discussing[1] exec and posix file locks it was realized that none of the callers of get_files_struct fundamentally needed to call get_files_struct, and that by switching them to helper functions instead it will both simplify their code and remove unnecessary increments of files_struct.count. Those unnecessary increments can result in exec unnecessarily unsharing files_struct which breaks posix locks, and it can result in fget_light having to fall back to fget, reducing system performance. Instead of manually coding finding the files struct for a task and then calling files_lookup_fd_rcu, use the helper task_lookup_fd_rcu that combines those two steps, making the code simpler and removing the need to get a reference on a files_struct.
[1] https://lkml.kernel.org/r/20180915160423.GA31461@redhat.com Suggested-by: Oleg Nesterov Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-7-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-12-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/proc/fd.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 3dec44d7c5c5..c1a984f3c4df 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -83,18 +83,13 @@ static const struct file_operations proc_fdinfo_file_operations = { static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { - struct files_struct *files = get_files_struct(task); struct file *file; - if (!files) - return false; - rcu_read_lock(); - file = files_lookup_fd_rcu(files, fd); + file = task_lookup_fd_rcu(task, fd); if (file) *mode = file->f_mode; rcu_read_unlock(); - put_files_struct(files); return !!file; } -- cgit v1.2.3 From 5b17b61870e2f4b0a4fdc5c6039fbdb4ffb796df Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:32 -0600 Subject: proc/fd: In proc_readfd_common use task_lookup_next_fd_rcu When discussing[1] exec and posix file locks it was realized that none of the callers of get_files_struct fundamentally needed to call get_files_struct, and that by switching them to helper functions instead it will both simplify their code and remove unnecessary increments of files_struct.count. Those unnecessary increments can result in exec unnecessarily unsharing files_struct which breaks posix locks, and it can result in fget_light having to fall back to fget, reducing system performance. Using task_lookup_next_fd_rcu simplifies proc_readfd_common, by moving the checking for the maximum file descriptor into the generic code, and by removing the need for capturing and releasing a reference on files_struct.
As task_lookup_next_fd_rcu may update the fd, ctx->pos has been changed to be the fd + 2 after task_lookup_next_fd_rcu returns. [1] https://lkml.kernel.org/r/20180915160423.GA31461@redhat.com Suggested-by: Oleg Nesterov Tested-by: Andy Lavr v1: https://lkml.kernel.org/r/20200817220425.9389-10-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-15-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/proc/fd.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index c1a984f3c4df..72c1525b4b3e 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -217,7 +217,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, instantiate_t instantiate) { struct task_struct *p = get_proc_task(file_inode(file)); - struct files_struct *files; unsigned int fd; if (!p) @@ -225,22 +224,18 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - files = get_files_struct(p); - if (!files) - goto out; rcu_read_lock(); - for (fd = ctx->pos - 2; - fd < files_fdtable(files)->max_fds; - fd++, ctx->pos++) { + for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; - f = files_lookup_fd_rcu(files, fd); + f = task_lookup_next_fd_rcu(p, &fd); + ctx->pos = fd + 2LL; if (!f) - continue; + break; data.mode = f->f_mode; rcu_read_unlock(); data.fd = fd; @@ -249,13 +244,11 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) - goto out_fd_loop; + goto out; cond_resched(); rcu_read_lock(); } rcu_read_unlock(); -out_fd_loop: - put_files_struct(files); out: put_task_struct(p); return 0; -- cgit v1.2.3 From 775e0656b27210ae668e33af00bece858f44576f Mon Sep 17 00:00:00 2001 From: "Eric W.
Biederman" Date: Fri, 20 Nov 2020 17:14:34 -0600 Subject: proc/fd: In fdinfo seq_show don't use get_files_struct When discussing[1] exec and posix file locks it was realized that none of the callers of get_files_struct fundamentally needed to call get_files_struct, and that by switching them to helper functions instead it will both simplify their code and remove unnecessary increments of files_struct.count. Those unnecessary increments can result in exec unnecessarily unsharing files_struct which breaks posix locks, and it can result in fget_light having to fall back to fget, reducing system performance. Instead hold task_lock for the duration that task->files needs to be stable in seq_show. The task_lock was already taken in get_files_struct, and so skipping get_files_struct performs less work overall, and avoids the problems with the files_struct reference count. [1] https://lkml.kernel.org/r/20180915160423.GA31461@redhat.com Suggested-by: Oleg Nesterov Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-12-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-17-ebiederm@xmission.com Signed-off-by: Eric W.
Biederman --- fs/proc/fd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 72c1525b4b3e..cb51763ed554 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -28,9 +28,8 @@ static int seq_show(struct seq_file *m, void *v) if (!task) return -ENOENT; - files = get_files_struct(task); - put_task_struct(task); - + task_lock(task); + files = task->files; if (files) { unsigned int fd = proc_fd(m->private); @@ -47,8 +46,9 @@ static int seq_show(struct seq_file *m, void *v) ret = 0; } spin_unlock(&files->file_lock); - put_files_struct(files); } + task_unlock(task); + put_task_struct(task); if (ret) return ret; @@ -57,6 +57,7 @@ static int seq_show(struct seq_file *m, void *v) (long long)file->f_pos, f_flags, real_mount(file->f_path.mnt)->mnt_id); + /* show_fd_locks() never dereferences files so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; -- cgit v1.2.3