diff options
39 files changed, 234 insertions, 96 deletions
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 36d42da7466a..5ddd128d4b7a 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -477,3 +477,4 @@ # 545 reserved for clone3 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd +549 common faccessat2 sys_faccessat2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 4d1cf74a2caa..d5cae5ffede0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -451,3 +451,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 803039d504de..3b859596840d 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 439 +#define __NR_compat_syscalls 440 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index c1c61635f89c..6d95d0c8bf2f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -883,6 +883,8 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 042911e670b8..49e325b604b3 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -358,3 +358,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f4f49fcb76d0..f71b1bbcc198 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 435 common clone3 __sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 4c67b11f9c9e..edacc4561f2b 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 1f9e8ad636cc..f777141f5256 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -376,3 +376,4 @@ 435 n32 clone3 __sys_clone3 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd +439 n32 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c0b9d802dbf6..da8c76394e17 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -352,3 +352,4 @@ 435 n64 clone3 __sys_clone3 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd +439 n64 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index ac586774c980..13280625d312 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -425,3 +425,4 @@ 435 o32 clone3 __sys_clone3 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd +439 o32 faccessat2 sys_faccessat2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 52a15f5cd130..5a758fa6ec52 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -435,3 +435,4 @@ 435 common clone3 sys_clone3_wrapper 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 220ae11555f2..f833a3190822 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -527,3 +527,4 @@ 435 spu clone3 sys_ni_syscall 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bd7bd3581a0f..bfdcb7633957 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 435 common clone3 sys_clone3 sys_clone3 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index c7a30fcd135f..acc35daa1b79 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index f13615ecdecc..8004a276cb74 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -483,3 +483,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 54581ac671b4..d8f8a1a69ed1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -442,3 +442,4 @@ 435 i386 clone3 sys_clone3 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 37b844f839bc..78847b32e137 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -359,6 +359,7 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 85a9ab1bc04d..69d0d73876b3 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -408,3 +408,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 @@ -176,6 +176,7 @@ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; + struct cred *creds; }; struct poll_iocb { @@ -1589,8 +1590,11 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); + const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + revert_creds(old_cred); + put_cred(iocb->fsync.creds); iocb_put(iocb); } @@ -1604,6 +1608,10 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, if (unlikely(!req->file->f_op->fsync)) return -EINVAL; + req->creds = prepare_creds(); + if (!req->creds) + return -ENOMEM; + req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); diff --git a/fs/char_dev.c b/fs/char_dev.c index c5e6eff5a381..ba0ded7842a7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -483,6 +483,9 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) p->dev = dev; p->count = count; + if (WARN_ON(dev == WHITEOUT_DEV)) + return -EBUSY; + error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) diff --git a/fs/fs_context.c b/fs/fs_context.c index fc9f6ef93b55..7d5c5dd2b1d5 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -42,7 +42,6 @@ static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, { "mand", SB_MANDLOCK }, - { "posixacl", SB_POSIXACL }, { "ro", SB_RDONLY }, { "sync", SB_SYNCHRONOUS }, { }, @@ -53,44 +52,15 @@ static const struct constant_table common_clear_sb_flag[] = { { "nolazytime", SB_LAZYTIME }, { "nomand", SB_MANDLOCK }, { "rw", SB_RDONLY }, - { "silent", SB_SILENT }, { }, }; -static const char *const forbidden_sb_flag[] = { - "bind", - "dev", - "exec", - "move", - "noatime", - "nodev", - "nodiratime", - "noexec", - "norelatime", - "nostrictatime", - "nosuid", - "private", - "rec", - "relatime", - "remount", - "shared", - "slave", - "strictatime", - "suid", - "unbindable", -}; - /* * Check for a common mount option that manipulates s_flags. */ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) { unsigned int token; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) - if (strcmp(key, forbidden_sb_flag[i]) == 0) - return -EINVAL; token = lookup_constant(common_set_sb_flag, key, 0); if (token) { diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..0d467e32dd7e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,7 +126,6 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -long do_faccessat(int dfd, const char __user *filename, int mode); int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); diff --git a/fs/mount.h b/fs/mount.h index 711a4093e475..c7abb7b394d8 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -9,7 +9,13 @@ struct mnt_namespace { atomic_t count; struct ns_common ns; struct mount * root; + /* + * Traversal and modification of .list is protected by either + * - taking namespace_sem for write, OR + * - taking namespace_sem for read AND taking .ns_lock. + */ struct list_head list; + spinlock_t ns_lock; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ @@ -133,9 +139,7 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); - void *cached_mount; - u64 cached_event; - loff_t cached_index; + struct mount cursor; }; extern const struct seq_operations mounts_op; @@ -153,3 +157,5 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->seq == 0; } + +extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); diff --git a/fs/namei.c b/fs/namei.c index a320371899cf..d81f73ff1a8b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3505,12 +3505,14 @@ EXPORT_SYMBOL(user_path_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { + bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(dir, dentry); if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && + !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) @@ -4345,9 +4347,6 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, (flags & RENAME_EXCHANGE)) return -EINVAL; - if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) - return -EPERM; - if (flags & RENAME_EXCHANGE) target_flags = 0; @@ -4483,20 +4482,6 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_whiteout(struct inode *dir, struct dentry *dentry) -{ - int error = may_create(dir, dentry); - if (error) - return error; - - if (!dir->i_op->mknod) - return -EPERM; - - return dir->i_op->mknod(dir, dentry, - S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); -} -EXPORT_SYMBOL(vfs_whiteout); - int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/fs/namespace.c b/fs/namespace.c index 5f036dc711b6..a6baee3c7904 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -648,6 +648,21 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } +static inline void lock_ns_list(struct mnt_namespace *ns) +{ + spin_lock(&ns->ns_lock); +} + +static inline void unlock_ns_list(struct mnt_namespace *ns) +{ + spin_unlock(&ns->ns_lock); +} + +static inline bool mnt_is_cursor(struct mount *mnt) +{ + return mnt->mnt.mnt_flags & MNT_CURSOR; +} + /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. @@ -673,11 +688,15 @@ bool __is_local_mountpoint(struct dentry *dentry) goto out; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { + if (mnt_is_cursor(mnt)) + continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } + unlock_ns_list(ns); up_read(&namespace_sem); out: return is_covered; @@ -1245,46 +1264,71 @@ struct vfsmount *mnt_clone_internal(const struct path *path) } #ifdef CONFIG_PROC_FS +static struct mount *mnt_list_next(struct mnt_namespace *ns, + struct list_head *p) +{ + struct mount *mnt, *ret = NULL; + + lock_ns_list(ns); + list_for_each_continue(p, &ns->list) { + mnt = list_entry(p, typeof(*mnt), mnt_list); + if (!mnt_is_cursor(mnt)) { + ret = mnt; + break; + } + } + unlock_ns_list(ns); + + return ret; +} + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; + struct list_head *prev; down_read(&namespace_sem); - if (p->cached_event == p->ns->event) { - void *v = p->cached_mount; - if (*pos == p->cached_index) - return v; - if (*pos == p->cached_index + 1) { - v = seq_list_next(v, &p->ns->list, &p->cached_index); - return p->cached_mount = v; - } + if (!*pos) { + prev = &p->ns->list; + } else { + prev = &p->cursor.mnt_list; + + /* Read after we'd reached the end? */ + if (list_empty(prev)) + return NULL; } - p->cached_event = p->ns->event; - p->cached_mount = seq_list_start(&p->ns->list, *pos); - p->cached_index = *pos; - return p->cached_mount; + return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; + struct mount *mnt = v; - p->cached_mount = seq_list_next(v, &p->ns->list, pos); - p->cached_index = *pos; - return p->cached_mount; + ++*pos; + return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + lock_ns_list(p->ns); + if (mnt) + list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); + else + list_del_init(&p->cursor.mnt_list); + unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; - struct mount *r = list_entry(v, struct mount, mnt_list); + struct mount *r = v; return p->show(m, &r->mnt); } @@ -1294,6 +1338,15 @@ const struct seq_operations mounts_op = { .stop = m_stop, .show = m_show, }; + +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) +{ + down_read(&namespace_sem); + lock_ns_list(ns); + list_del(&cursor->mnt_list); + unlock_ns_list(ns); + up_read(&namespace_sem); +} #endif /* CONFIG_PROC_FS */ /** @@ -3202,6 +3255,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); + spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3842,10 +3896,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, bool visible = false; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; + if (mnt_is_cursor(mnt)) + continue; + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -3893,6 +3951,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: + unlock_ns_list(ns); up_read(&namespace_sem); return visible; } diff --git a/fs/open.c b/fs/open.c index 719b320ede52..e62b1db06638 100644 --- a/fs/open.c +++ b/fs/open.c @@ -345,21 +345,14 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ -long do_faccessat(int dfd, const char __user *filename, int mode) +static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; - struct path path; - struct inode *inode; - int res; - unsigned int lookup_flags = LOOKUP_FOLLOW; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; override_cred = prepare_creds(); if (!override_cred) - return -ENOMEM; + return NULL; override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -394,6 +387,38 @@ long do_faccessat(int dfd, const char __user *filename, int mode) override_cred->non_rcu = 1; old_cred = override_creds(override_cred); + + /* override_cred() gets its own ref */ + put_cred(override_cred); + + return old_cred; +} + +long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +{ + struct path path; + struct inode *inode; + int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *old_cred = NULL; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (!(flags & AT_EACCESS)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } + retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) @@ -435,19 +460,26 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); - put_cred(override_cred); + if (old_cred) + revert_creds(old_cred); + return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { - return do_faccessat(dfd, filename, mode); + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } int ksys_chdir(const char __user *filename) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 273ee82d8aa9..e4d70c0dffe9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -279,7 +279,8 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->ns = ns; p->root = root; p->show = show; - p->cached_event = ~0ULL; + INIT_LIST_HEAD(&p->cursor.mnt_list); + p->cursor.mnt.mnt_flags = MNT_CURSOR; return 0; @@ -296,6 +297,7 @@ static int mounts_release(struct inode *inode, struct file *file) struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); + mnt_cursor_del(p->ns, &p->cursor); put_mnt_ns(p->ns); return seq_release_private(inode, file); } diff --git a/fs/stat.c b/fs/stat.c index 030008796479..b9faa6cafafe 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -22,6 +22,7 @@ #include <asm/unistd.h> #include "internal.h" +#include "mount.h" /** * generic_fillattr - Fill in the basic attributes from the inode struct @@ -70,11 +71,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; - request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; /* allow the fs to override these if it really wants to */ - if (IS_NOATIME(inode)) + /* SB_NOATIME means filesystem supplies dummy atime value */ + if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; @@ -199,6 +200,11 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) + stat->attributes |= STATX_ATTR_MOUNT_ROOT; + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -563,6 +569,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); + tmp.stx_mnt_id = stat->mnt_id; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/fs/utimes.c b/fs/utimes.c index 1d17ce98cb80..b7b927502d6e 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -95,13 +95,13 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, goto out; } - if (flags & ~AT_SYMLINK_NOFOLLOW) + if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) goto out; if (filename == NULL && dfd != AT_FDCWD) { struct fd f; - if (flags & AT_SYMLINK_NOFOLLOW) + if (flags) goto out; f = fdget(dfd); @@ -117,6 +117,8 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 9a72214496e5..d02f32b7514e 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -44,6 +44,9 @@ static inline int devcgroup_inode_mknod(int mode, dev_t dev) if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; + if (S_ISCHR(mode) && dev == WHITEOUT_DEV) + return 0; + if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else diff --git a/include/linux/fs.h b/include/linux/fs.h index 45cc10cdf6dd..109b5d9dbdc7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1721,7 +1721,11 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); -extern int vfs_whiteout(struct inode *, struct dentry *); + +static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag); diff --git a/include/linux/mount.h b/include/linux/mount.h index bf8cc4108b8f..7edac8c7a9c1 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,8 @@ struct fs_context; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ + MNT_CURSOR) #define MNT_INTERNAL 0x4000 @@ -64,6 +65,7 @@ struct fs_context; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 +#define MNT_CURSOR 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ diff --git a/include/linux/stat.h b/include/linux/stat.h index 528c4baad091..56614af83d4a 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -47,6 +47,7 @@ struct kstat { struct timespec64 ctime; struct timespec64 btime; /* File creation time */ u64 blocks; + u64 mnt_id; }; #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1815065d52f3..7c354c2955f5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -428,6 +428,8 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); #endif asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode); +asmlinkage long sys_faccessat2(int dfd, const char __user *filename, int mode, + int flags); asmlinkage long sys_chdir(const char __user *filename); asmlinkage long sys_fchdir(unsigned int fd); asmlinkage long sys_chroot(const char __user *filename); @@ -1333,11 +1335,11 @@ static inline int ksys_chmod(const char __user *filename, umode_t mode) return do_fchmodat(AT_FDCWD, filename, mode); } -extern long do_faccessat(int dfd, const char __user *filename, int mode); +long do_faccessat(int dfd, const char __user *filename, int mode, int flags); static inline long ksys_access(const char __user *filename, int mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } extern int do_fchownat(int dfd, const char __user *filename, uid_t user, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 3a3201e4618e..f4a01305d9a6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -855,9 +855,11 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) #undef __NR_syscalls -#define __NR_syscalls 439 +#define __NR_syscalls 440 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ca88b7bce553..2f86b2ad6d7e 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -84,10 +84,20 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it equivalent to + * AT_EACCESS is valid undefined behavior. + */ #define AT_FDCWD -100 /* Special value used to indicate openat should use the current working directory. */ #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ #define AT_REMOVEDIR 0x200 /* Remove directory instead of unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index ad80a5c885d5..6df9348bb277 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -123,7 +123,10 @@ struct statx { __u32 stx_dev_major; /* ID of device containing file [uncond] */ __u32 stx_dev_minor; /* 0x90 */ - __u64 __spare2[14]; /* Spare space for future expansion */ + __u64 stx_mnt_id; + __u64 __spare2; + /* 0xa0 */ + __u64 __spare3[12]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -148,9 +151,19 @@ struct statx { #define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ + #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + /* * Attributes to be found in stx_attributes and masked in stx_attributes_mask. * @@ -168,6 +181,7 @@ struct statx { #define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ #define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ #define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c index a3d68159fb51..76c577ea4fd8 100644 --- a/samples/vfs/test-statx.c +++ b/samples/vfs/test-statx.c @@ -216,7 +216,7 @@ int main(int argc, char **argv) struct statx stx; int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW; - unsigned int mask = STATX_ALL; + unsigned int mask = STATX_BASIC_STATS | STATX_BTIME; for (argv++; *argv; argv++) { if (strcmp(*argv, "-F") == 0) { diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index ad80a5c885d5..d1192783139a 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -148,9 +148,18 @@ struct statx { #define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_ALL 0x00000fffU /* All currently supported flags */ + #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + /* * Attributes to be found in stx_attributes and masked in stx_attributes_mask. * |