From c9fa2b07fa99e648b5042a32dbfa39ba68a190db Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 26 Jun 2022 20:45:07 +0200 Subject: mnt_idmapping: add vfs[g,u]id_into_k[g,u]id() Add two tiny helpers to convert a vfsuid into a kuid and a vfsgid into a kgid. Signed-off-by: Christian Brauner (Microsoft) --- include/linux/mnt_idmapping.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index 41dc80f8b67c..f6e5369d2928 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -333,6 +333,19 @@ static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns, return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid)); } +/** + * vfsuid_into_kuid - convert vfsuid into kuid + * @vfsuid: the vfsuid to convert + * + * This can be used when a vfsuid is committed as a kuid. + * + * Return: a kuid with the value of @vfsuid + */ +static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid) +{ + return AS_KUIDT(vfsuid); +} + /** * from_vfsgid - map a vfsgid into the filesystem idmapping * @mnt_userns: the mount's idmapping @@ -406,6 +419,19 @@ static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns, return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid)); } +/** + * vfsgid_into_kgid - convert vfsgid into kgid + * @vfsgid: the vfsgid to convert + * + * This can be used when a vfsgid is committed as a kgid. + * + * Return: a kgid with the value of @vfsgid + */ +static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) +{ + return AS_KGIDT(vfsgid); +} + /** * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns * @mnt_userns: the mount's idmapping -- cgit v1.2.3 From 0c5fd887d2bb47aa37aa9fb1eb1d1d2abac62972 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 6 Jul 2022 18:30:59 +0200 Subject: acl: move idmapped mount fixup into vfs_{g,s}etxattr() This cycle we added support for mounting overlayfs on top of idmapped mounts.
Recently I've started looking into potential corner cases when trying to add additional tests and I noticed that reporting for POSIX ACLs is currently wrong when using idmapped layers with overlayfs mounted on top of them. I'm going to give a rather detailed explanation of both the origin of the problem and the solution. Let's assume the user creates the following directory layout and they have a rootfs /var/lib/lxc/c2/rootfs. The files in this rootfs are owned as you would expect files on your host system to be owned. For example, ~/.bashrc for your regular user would be owned by 1000:1000 and /root/.bashrc would be owned by 0:0. IOW, this is just a regular boring filesystem tree on an ext4 or xfs filesystem. The user chooses to set POSIX ACLs using the setfacl binary granting the user with uid 4 read, write, and execute permissions for their .bashrc file: setfacl -m u:4:rwx /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc Now they want to expose the whole rootfs to a container using an idmapped mount. So they first create: mkdir -pv /vol/contpool/{ctrover,merge,lowermap,overmap} mkdir -pv /vol/contpool/ctrover/{over,work} chown 10000000:10000000 /vol/contpool/ctrover/{over,work} The user now creates an idmapped mount for the rootfs: mount-idmapped/mount-idmapped --map-mount=b:0:10000000:65536 \ /var/lib/lxc/c2/rootfs \ /vol/contpool/lowermap This for example makes it so that /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc, which is owned by uid and gid 1000, shows up as being owned by uid and gid 10001000 at /vol/contpool/lowermap/home/ubuntu/.bashrc. Assume the user wants to expose these idmapped mounts through an overlayfs mount to a container. mount -t overlay overlay \ -o lowerdir=/vol/contpool/lowermap, \ upperdir=/vol/contpool/overmap/over, \ workdir=/vol/contpool/overmap/work \ /vol/contpool/merge The user can do this in two ways: (1) Mount overlayfs in the initial user namespace and expose it to the container.
(2) Mount overlayfs on top of the idmapped mounts inside of the container's user namespace. Let's assume the user chooses option (1) and mounts overlayfs on the host and then changes into a container which uses the idmapping 0:10000000:65536 which is the same as the one used for the two idmapped mounts. Now the user tries to retrieve the POSIX ACLs using the getfacl command getfacl -n /vol/contpool/merge/home/ubuntu/.bashrc and to their surprise they see: # file: vol/contpool/merge/home/ubuntu/.bashrc # owner: 1000 # group: 1000 user::rw- user:4294967295:rwx group::r-- mask::rwx other::r-- indicating that the uid wasn't correctly translated according to the idmapped mount. The problem is how we currently translate POSIX ACLs. Let's inspect the callchain in this example: idmapped mount /vol/contpool/merge: 0:10000000:65536 caller's idmapping: 0:10000000:65536 overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */ sys_getxattr() -> path_getxattr() -> getxattr() -> do_getxattr() |> vfs_getxattr() | -> __vfs_getxattr() | -> handler->get == ovl_posix_acl_xattr_get() | -> ovl_xattr_get() | -> vfs_getxattr() | -> __vfs_getxattr() | -> handler->get() /* lower filesystem callback */ |> posix_acl_fix_xattr_to_user() { 4 = make_kuid(&init_user_ns, 4); 4 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 4); /* FAILURE */ -1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4); } If the user chooses to use option (2) and mounts overlayfs on top of idmapped mounts inside the container things don't look that much better: idmapped mount /vol/contpool/merge: 0:10000000:65536 caller's idmapping: 0:10000000:65536 overlayfs idmapping (ofs->creator_cred): 0:10000000:65536 sys_getxattr() -> path_getxattr() -> getxattr() -> do_getxattr() |> vfs_getxattr() | -> __vfs_getxattr() | -> handler->get == ovl_posix_acl_xattr_get() | -> ovl_xattr_get() | -> vfs_getxattr() | -> __vfs_getxattr() | -> handler->get() /* lower filesystem callback */ |>
posix_acl_fix_xattr_to_user() { 4 = make_kuid(&init_user_ns, 4); 4 = mapped_kuid_fs(&init_user_ns, 4); /* FAILURE */ -1 = from_kuid(0:10000000:65536 /* caller's idmapping */, 4); } As is easily seen the problem arises because the idmapping of the lower mount isn't taken into account as all of this happens in do_getxattr(). But do_getxattr() is always called on an overlayfs mount and inode and thus cannot possibly take the idmapping of the lower layers into account. This problem is similar for fscaps but there the translation happens as part of vfs_getxattr() already. Let's walk through an fscaps overlayfs callchain: setcap 'cap_net_raw+ep' /var/lib/lxc/c2/rootfs/home/ubuntu/.bashrc The expected outcome here is that we'll receive the cap_net_raw capability as we are able to map the uid associated with the fscap to 0 within our container. IOW, we want to see 0 as the result of the idmapping translations. If the user chooses option (1) we get the following callchain for fscaps: idmapped mount /vol/contpool/merge: 0:10000000:65536 caller's idmapping: 0:10000000:65536 overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */ sys_getxattr() -> path_getxattr() -> getxattr() -> do_getxattr() -> vfs_getxattr() -> xattr_getsecurity() -> security_inode_getsecurity() ________________________________ -> cap_inode_getsecurity() | | { V | 10000000 = make_kuid(0:0:4k /* overlayfs idmapping */, 10000000); | 10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); | /* Expected result is 0 and thus that we own the fscap.
*/ | 0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); | } | -> vfs_getxattr_alloc() | -> handler->get == ovl_other_xattr_get() | -> vfs_getxattr() | -> xattr_getsecurity() | -> security_inode_getsecurity() | -> cap_inode_getsecurity() | { | 0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); | 10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); | 10000000 = from_kuid(0:0:4k /* overlayfs idmapping */, 10000000); | |____________________________________________________________________| } -> vfs_getxattr_alloc() -> handler->get == /* lower filesystem callback */ And if the user chooses option (2) we get: idmapped mount /vol/contpool/merge: 0:10000000:65536 caller's idmapping: 0:10000000:65536 overlayfs idmapping (ofs->creator_cred): 0:10000000:65536 sys_getxattr() -> path_getxattr() -> getxattr() -> do_getxattr() -> vfs_getxattr() -> xattr_getsecurity() -> security_inode_getsecurity() _______________________________ -> cap_inode_getsecurity() | | { V | 10000000 = make_kuid(0:10000000:65536 /* overlayfs idmapping */, 0); | 10000000 = mapped_kuid_fs(0:0:4k /* no idmapped mount */, 10000000); | /* Expected result is 0 and thus that we own the fscap. */ | 0 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000000); | } | -> vfs_getxattr_alloc() | -> handler->get == ovl_other_xattr_get() | |-> vfs_getxattr() | -> xattr_getsecurity() | -> security_inode_getsecurity() | -> cap_inode_getsecurity() | { | 0 = make_kuid(0:0:4k /* lower s_user_ns */, 0); | 10000000 = mapped_kuid_fs(0:10000000:65536 /* idmapped mount */, 0); | 0 = from_kuid(0:10000000:65536 /* overlayfs idmapping */, 10000000); | |____________________________________________________________________| } -> vfs_getxattr_alloc() -> handler->get == /* lower filesystem callback */ We can see how the translation happens correctly in those cases as the conversion happens within the vfs_getxattr() helper. For POSIX ACLs we need to do something similar. 
However, in contrast to fscaps we cannot apply the fix directly to the kernel internal posix acl data structure as this would alter the cached values and would also require a rework of how we currently deal with POSIX ACLs in general which almost never take the filesystem idmapping into account (the notable exception being FUSE but even there the implementation is special) and instead retrieve the raw values based on the initial idmapping. The correct values are then generated right before returning to userspace. The fix for this is to move taking the mount's idmapping into account directly in vfs_getxattr() instead of having it be part of posix_acl_fix_xattr_to_user(). To this end we split out two small and unexported helpers posix_acl_getxattr_idmapped_mnt() and posix_acl_setxattr_idmapped_mnt(). The former is called in vfs_getxattr() and the latter is called in vfs_setxattr(). Let's go back to the original example. Assume the user chose option (1) and mounted overlayfs on top of idmapped mounts on the host: idmapped mount /vol/contpool/merge: 0:10000000:65536 caller's idmapping: 0:10000000:65536 overlayfs idmapping (ofs->creator_cred): 0:0:4k /* initial idmapping */ sys_getxattr() -> path_getxattr() -> getxattr() -> do_getxattr() |> vfs_getxattr() | |> __vfs_getxattr() | | -> handler->get == ovl_posix_acl_xattr_get() | | -> ovl_xattr_get() | | -> vfs_getxattr() | | |> __vfs_getxattr() | | | -> handler->get() /* lower filesystem callback */ | | |> posix_acl_getxattr_idmapped_mnt() | | { | | 4 = make_kuid(&init_user_ns, 4); | | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4); | | 10000004 = from_kuid(&init_user_ns, 10000004); | | |_______________________ | | } | | | | | |> posix_acl_getxattr_idmapped_mnt() | | { | | V | 10000004 = make_kuid(&init_user_ns, 10000004); | 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004); | 10000004 = from_kuid(&init_user_ns, 10000004); | }
|_________________________________________________ | | | | |> posix_acl_fix_xattr_to_user() | { V 10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004); /* SUCCESS */ 4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004); } And similarly if the user chooses option (2) and mounts overlayfs on top of idmapped mounts inside the container: idmapped mount /vol/contpool/merge: 0:10000000:65536 caller's idmapping: 0:10000000:65536 overlayfs idmapping (ofs->creator_cred): 0:10000000:65536 sys_getxattr() -> path_getxattr() -> getxattr() -> do_getxattr() |> vfs_getxattr() | |> __vfs_getxattr() | | -> handler->get == ovl_posix_acl_xattr_get() | | -> ovl_xattr_get() | | -> vfs_getxattr() | | |> __vfs_getxattr() | | | -> handler->get() /* lower filesystem callback */ | | |> posix_acl_getxattr_idmapped_mnt() | | { | | 4 = make_kuid(&init_user_ns, 4); | | 10000004 = mapped_kuid_fs(0:10000000:65536 /* lower idmapped mount */, 4); | | 10000004 = from_kuid(&init_user_ns, 10000004); | | |_______________________ | | } | | | | | |> posix_acl_getxattr_idmapped_mnt() | | { V | 10000004 = make_kuid(&init_user_ns, 10000004); | 10000004 = mapped_kuid_fs(&init_user_ns /* no idmapped mount */, 10000004); | 10000004 = from_kuid(&init_user_ns, 10000004); | |_________________________________________________ | } | | | |> posix_acl_fix_xattr_to_user() | { V 10000004 = make_kuid(0:0:4k /* init_user_ns */, 10000004); /* SUCCESS */ 4 = from_kuid(0:10000000:65536 /* caller's idmapping */, 10000004); } The last remaining problem we need to fix here is ovl_get_acl(). During ovl_permission() overlayfs will call: ovl_permission() -> generic_permission() -> acl_permission_check() -> check_acl() -> get_acl() -> inode->i_op->get_acl() == ovl_get_acl() > get_acl() /* on the underlying filesystem */ ->inode->i_op->get_acl() == /* lower filesystem callback */ -> posix_acl_permission() passing through the get_acl request to the underlying filesystem.
This will retrieve the acls stored in the lower filesystem without taking the idmapping of the underlying mount into account as this would mean altering the cached values for the lower filesystem. So we block using ACLs for now until we decided on a nice way to fix this. Note this limitation both in the documentation and in the code. The most straightforward solution would be to have ovl_get_acl() simply duplicate the ACLs, update the values according to the idmapped mount and return it to acl_permission_check() so it can be used in posix_acl_permission() forgetting them afterwards. This is a bit heavy handed but fairly straightforward otherwise. Link: https://github.com/brauner/mount-idmapped/issues/9 Link: https://lore.kernel.org/r/20220708090134.385160-2-brauner@kernel.org Cc: Seth Forshee Cc: Amir Goldstein Cc: Vivek Goyal Cc: Christoph Hellwig Cc: Aleksa Sarai Cc: Miklos Szeredi Cc: linux-unionfs@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) --- fs/ksmbd/vfs.c | 2 +- fs/ksmbd/vfs.h | 2 +- fs/overlayfs/overlayfs.h | 3 +- fs/posix_acl.c | 147 +++++++++++++++++++++++++++++----------- fs/xattr.c | 25 +++++-- include/linux/posix_acl_xattr.h | 34 ++++++---- include/linux/xattr.h | 2 +- 7 files changed, 151 insertions(+), 64 deletions(-) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 05efcdf7a4a7..7c849024999f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, */ int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags) + void *attr_value, size_t attr_size, int flags) { int err; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 8c37aaf936ab..70da4c0ba7ad 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, int attr_name_len); int 
ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags); + void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e22e20f4811a..6ec815b84d48 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -249,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + (void *)value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 962d32468eb4..d954852a0158 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -375,8 +375,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, goto check_perm; break; case ACL_USER: - uid = mapped_kuid_fs(mnt_userns, - i_user_ns(inode), + uid = mapped_kuid_fs(mnt_userns, &init_user_ns, pa->e_uid); if (uid_eq(uid, current_fsuid())) goto mask; @@ -390,8 +389,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, } break; case ACL_GROUP: - gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), + gid = mapped_kgid_fs(mnt_userns, &init_user_ns, pa->e_gid); if (in_group_p(gid)) { found = 1; @@ -710,46 +708,127 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. 
*/ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - struct user_namespace *mnt_userns, - void *value, size_t size, bool from_user) +static int posix_acl_fix_xattr_common(void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + int count; + + if (!header) + return -EINVAL; + if (size < sizeof(struct posix_acl_xattr_header)) + return -EINVAL; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return -EINVAL; + + count = posix_acl_xattr_count(size); + if (count < 0) + return -EINVAL; + if (count == 0) + return -EINVAL; + + return count; +} + +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; kuid_t uid; kgid_t gid; - if (!value) + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - if (size < sizeof(struct posix_acl_xattr_header)) + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, + vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, + vfsgid_into_kgid(vfsgid))); + break; + default: + break; + } + } +} + +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct 
posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + kuid_t uid; + kgid_t gid; + + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) return; - if (count == 0) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = VFSUIDT_INIT(uid); + uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = VFSGIDT_INIT(gid); + gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); + break; + default: + break; + } + } +} + +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + kuid_t uid; + kgid_t gid; + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; for (end = entry + count; entry != end; entry++) { switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); - if (from_user) - uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); - else - uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); - if (from_user) - gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); - else - gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -758,34 +837,20 @@ static void posix_acl_fix_xattr_userns( } } -void 
posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_from_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, - size, true); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); } -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_to_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, - size, false); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } /* diff --git a/fs/xattr.c b/fs/xattr.c index e8dd03e4561e..a1f4998bc6be 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -282,9 +282,15 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); +static inline bool is_posix_acl_xattr(const char *name) +{ + return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); +} + int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) + const char *name, void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int 
error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, + (const void **)&value, size); if (error < 0) return error; size = error; } + if (size && is_posix_acl_xattr(name)) + posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - return __vfs_getxattr(dentry, inode, name, value, size); + error = __vfs_getxattr(dentry, inode, name, value, size); + if (error > 0 && is_posix_acl_xattr(name)) + posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); + return error; } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns, if (ctx->size && ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - ctx->kvalue, ctx->size); + posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, @@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), - ctx->kvalue, error); + posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 1766e1de6956..b6bd3eac2bcc 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -33,21 +33,31 @@ 
posix_acl_xattr_count(size_t size) } #ifdef CONFIG_FS_POSIX_ACL -void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size); -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size); +void posix_acl_fix_xattr_from_user(void *value, size_t size); +void posix_acl_fix_xattr_to_user(void *value, size_t size); +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size); +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size); #else -static inline void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +static inline void posix_acl_fix_xattr_from_user(void *value, size_t size) { } -static inline void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +static inline void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ +} +static inline void +posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, void *value, + size_t size) +{ +} +static inline void +posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, void *value, + size_t size) { } #endif diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 4c379d23ec6e..979a9d3e5bfb 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -61,7 +61,7 @@ int __vfs_setxattr_locked(struct user_namespace *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct user_namespace *, struct dentry *, const char *, - const void *, size_t, int); + void *, size_t, int); int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *); int __vfs_removexattr_locked(struct user_namespace *, 
struct dentry *, const char *, struct inode **); -- cgit v1.2.3 From e933c15f7621074ef6d5c137fe212996fb5038a8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 9 Jul 2022 12:23:20 +0200 Subject: acl: port to vfs{g,u}id_t Port the few remaining pieces to vfs{g,u}id_t and associated type safe helpers. Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d954852a0158..d4b60c18fda7 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -361,8 +361,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -370,28 +370,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(uid, current_fsuid())) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - uid = mapped_kuid_fs(mnt_userns, &init_user_ns, + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pa->e_uid); - if (uid_eq(uid, current_fsuid())) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - gid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(gid)) { + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - gid = mapped_kgid_fs(mnt_userns, &init_user_ns, + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pa->e_gid); - if (in_group_p(gid)) { + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -697,7 +697,7 @@ int posix_acl_update_mode(struct 
user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; -- cgit v1.2.3 From 8043bffd01833a8544f2466fb3804310d6e73d09 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 6 Jul 2022 17:13:23 +0200 Subject: acl: make posix_acl_clone() available to overlayfs The ovl_get_acl() function needs to alter the POSIX ACLs retrieved from the lower filesystem. Instead of hand-rolling an overlayfs-specific posix_acl_clone() variant, export it. It's not special and it's not deeply internal anyway. Link: https://lore.kernel.org/r/20220708090134.385160-3-brauner@kernel.org Cc: Seth Forshee Cc: Amir Goldstein Cc: Vivek Goyal Cc: Christoph Hellwig Cc: Aleksa Sarai Cc: Miklos Szeredi Cc: linux-unionfs@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) --- fs/posix_acl.c | 3 ++- include/linux/posix_acl.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d4b60c18fda7..1d17d7b13dcd 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ -static struct posix_acl * +struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) } return clone; } +EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index b65c877d92b8..7d1e604c1325 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -73,6 +73,7 @@ extern int set_posix_acl(struct user_namespace *, struct inode *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); +struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t); -- cgit v1.2.3 From 1aa5fef575a839a6d01bfacd7e912dfffd0a4345 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 6 Jul 2022 18:09:12 +0200 Subject: ovl: handle idmappings in ovl_get_acl() During permission checking overlayfs will call ovl_permission() -> generic_permission() -> acl_permission_check() -> check_acl() -> get_acl() -> inode->i_op->get_acl() == ovl_get_acl() -> get_acl() /* on the underlying filesystem */ -> inode->i_op->get_acl() == /*lower filesystem callback */ -> posix_acl_permission() passing through the get_acl() request to the underlying filesystem. Before returning these values to the VFS we need to take the idmapping of the relevant layer into account and translate any ACL_{GROUP,USER} values according to the idmapped mount. We cannot alter the ACLs returned from the relevant layer directly as that would alter the cached values filesystem wide for the lower filesystem. Instead we can clone the ACLs and then apply the relevant idmapping of the layer. This is obviously only relevant when idmapped layers are used. 
Link: https://lore.kernel.org/r/20220708090134.385160-4-brauner@kernel.org Cc: Seth Forshee Cc: Amir Goldstein Cc: Vivek Goyal Cc: Christoph Hellwig Cc: Aleksa Sarai Cc: Miklos Szeredi Cc: linux-unionfs@vger.kernel.org Reviewed-by: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) --- fs/overlayfs/inode.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 492eddeb481f..7922b619f6c8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -454,23 +454,94 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } +/* + * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone + * of the POSIX ACLs retrieved from the lower layer to this function to not + * alter the POSIX ACLs for the underlying filesystem. + */ +static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, + struct posix_acl *acl) +{ + for (unsigned int i = 0; i < acl->a_count; i++) { + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + struct posix_acl_entry *e = &acl->a_entries[i]; + switch (e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + e->e_uid = vfsuid_into_kuid(vfsuid); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + e->e_gid = vfsgid_into_kgid(vfsgid); + break; + } + } +} + +/* + * When the relevant layer is an idmapped mount we need to take the idmapping + * of the layer into account and translate any ACL_{GROUP,USER} values + * according to the idmapped mount. + * + * We cannot alter the ACLs returned from the relevant layer as that would + * alter the cached values filesystem wide for the lower filesystem. Instead we + * can clone the ACLs and then apply the relevant idmapping of the layer. + * + * This is obviously only relevant when idmapped layers are used. 
+ */ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); - const struct cred *old_cred; - struct posix_acl *acl; + struct posix_acl *acl, *clone; + struct path realpath; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - if (rcu) - return get_cached_acl_rcu(realinode, type); + /* Careful in RCU walk mode */ + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { + WARN_ON(!rcu); + return ERR_PTR(-ECHILD); + } - old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); - revert_creds(old_cred); + if (rcu) { + acl = get_cached_acl_rcu(realinode, type); + } else { + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + } + /* + * If there are no POSIX ACLs, or we encountered an error, + * or the layer isn't idmapped we don't need to do anything. + */ + if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) + return acl; - return acl; + /* + * We only get here if the layer is idmapped. So drop out of RCU path + * walk so we can clone the ACLs. There's no need to release the ACLs + * since get_cached_acl_rcu() doesn't take a reference on the ACLs. + */ + if (rcu) + return ERR_PTR(-ECHILD); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + clone = ERR_PTR(-ENOMEM); + else + ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + /* + * Since we're not in RCU path walk we always need to release the + * original ACLs. + */ + posix_acl_release(acl); + return clone; } int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) -- cgit v1.2.3 From 7c4d37c269ac8bf834b47718386d02ae94d54633 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 13 Jul 2022 11:47:44 +0200 Subject: Revert "ovl: turn of SB_POSIXACL with idmapped layers temporarily" This reverts commit 4a47c6385bb4e0786826e75bd4555aba32953653. 
Now that we have a proper fix for POSIX ACLs with overlayfs on top of idmapped layers revert the temporary fix. Signed-off-by: Christian Brauner (Microsoft) --- Documentation/filesystems/overlayfs.rst | 4 ---- fs/overlayfs/super.c | 25 +------------------------ 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index 316cfd8b1891..7da6c30ed596 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -466,10 +466,6 @@ overlay filesystem and the value of st_ino for filesystem objects may not be persistent and could change even while the overlay filesystem is mounted, as summarized in the `Inode properties`_ table above. -4) "idmapped mounts" -When the upper or lower layers are idmapped mounts overlayfs will be mounted -without support for POSIX Access Control Lists (ACLs). This limitation will -eventually be lifted. Changes to underlying filesystems --------------------------------- diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 1ce5c9698393..e0a2e0468ee7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1003,9 +1003,6 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - return ovl_xattr_get(dentry, inode, handler->name, buffer, size); } @@ -1021,9 +1018,6 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, struct posix_acl *acl = NULL; int err; - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - /* Check that everything is OK before copy-up */ if (value) { acl = posix_acl_from_xattr(&init_user_ns, value, size); @@ -1966,20 +1960,6 @@ static struct dentry *ovl_get_root(struct super_block *sb, return root; } -static bool ovl_has_idmapped_layers(struct ovl_fs *ofs) -{ - - unsigned int i; - const struct vfsmount *mnt; - - for (i = 0; i < 
ofs->numlayer; i++) { - mnt = ofs->layers[i].mnt; - if (mnt && is_idmapped_mnt(mnt)) - return true; - } - return false; -} - static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { }; @@ -2149,10 +2129,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; - if (ovl_has_idmapped_layers(ofs)) - pr_warn("POSIX ACLs are not yet supported with idmapped layers, mounting without ACL support.\n"); - else - sb->s_flags |= SB_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_iflags |= SB_I_SKIP_SYNC; err = -ENOMEM; -- cgit v1.2.3 From ba40a57ff08bf606135866bfe5fddc572089ac16 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 26 Jul 2022 16:16:15 +0200 Subject: Add Seth Forshee as co-maintainer for idmapped mounts Seth has been integral in the design and implementation of idmapped mounts and was the main architect behind the s_user_ns work which ultimately made filesystems such as FUSE and overlayfs available in containers. He continues to be active in both development and review. I'm very happy he decided to maintain this feature. He has my full trust. Link: https://lore.kernel.org/r/20220726141615.1046027-1-brauner@kernel.org Cc: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 66bffb24a348..6defeb24d8b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9548,6 +9548,7 @@ F: drivers/input/misc/ideapad_slidebar.c IDMAPPED MOUNTS M: Christian Brauner +M: Seth Forshee L: linux-fsdevel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git -- cgit v1.2.3