From 51d3d5eb74ff53b92dcff48b30ae2ed8edd85a32 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 9 Dec 2022 09:09:12 +0100 Subject: mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA Currently, we don't enable writenotify when enabling userfaultfd-wp on a shared writable mapping (for now only shmem and hugetlb). The consequence is that vma->vm_page_prot will still include write permissions, to be set as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, page migration, ...). So far, vma->vm_page_prot is assumed to be a safe default, meaning that we only add permissions (e.g., mkwrite) but not remove permissions (e.g., wrprotect). For example, when enabling softdirty tracking, we enable writenotify. With uffd-wp on shared mappings, that changed. More details on vma->vm_page_prot semantics were summarized in [1]. This is problematic for uffd-wp: we'd have to manually check for a uffd-wp PTEs/PMDs and manually write-protect PTEs/PMDs, which is error prone. Prone to such issues is any code that uses vma->vm_page_prot to set PTE permissions: primarily pte_modify() and mk_pte(). Instead, let's enable writenotify such that PTEs/PMDs/... will be mapped write-protected as default and we will only allow selected PTEs that are definitely safe to be mapped without write-protection (see can_change_pte_writable()) to be writable. In the future, we might want to enable write-bit recovery -- e.g., can_change_pte_writable() -- at more locations, for example, also when removing uffd-wp protection. This fixes two known cases: (a) remove_migration_pte() mapping uffd-wp'ed PTEs writable, resulting in uffd-wp not triggering on write access. (b) do_numa_page() / do_huge_pmd_numa_page() mapping uffd-wp'ed PTEs/PMDs writable, resulting in uffd-wp not triggering on write access. Note that do_numa_page() / do_huge_pmd_numa_page() can be reached even without NUMA hinting (which currently doesn't seem to be applicable to shmem), for example, by using uffd-wp with a PROT_WRITE shmem VMA. On such a VMA, userfaultfd-wp is currently non-functional. Note that when enabling userfaultfd-wp, there is no need to walk page tables to enforce the new default protection for the PTEs: we know that they cannot be uffd-wp'ed yet, because that can only happen after enabling uffd-wp for the VMA in general. Also note that this makes mprotect() on ranges with uffd-wp'ed PTEs not accidentally set the write bit -- which would result in uffd-wp not triggering on later write access. This commit makes uffd-wp on shmem behave just like uffd-wp on anonymous memory in that regard, even though, mixing mprotect with uffd-wp is controversial. [1] https://lkml.kernel.org/r/92173bad-caa3-6b43-9d1e-9a471fdbc184@redhat.com Link: https://lkml.kernel.org/r/20221209080912.7968-1-david@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: David Hildenbrand Reported-by: Ives van Hoorne Debugged-by: Peter Xu Acked-by: Peter Xu Cc: Hugh Dickins Cc: Alistair Popple Cc: Mike Rapoport Cc: Nadav Amit Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 98ac37e34e3d..cc694846617a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) +{ + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + + vma->vm_flags = flags; + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) + vma_set_page_prot(vma); +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -618,7 +633,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, + vma->vm_flags & ~__VM_UFFD_FLAGS); } } mmap_write_unlock(mm); @@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; } @@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } } @@ -895,7 +911,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; } - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } mmap_write_unlock(mm); @@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) @@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; skip: -- cgit v1.2.3 From 7633355e5c7f29c049a9048e461427d1d8ed3051 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 5 Jan 2023 14:53:56 +0900 Subject: nilfs2: fix general protection fault in nilfs_btree_insert() If nilfs2 reads a corrupted disk image and tries to reads a b-tree node block by calling __nilfs_btree_get_block() against an invalid virtual block address, it returns -ENOENT because conversion of the virtual block address to a disk block address fails. However, this return value is the same as the internal code that b-tree lookup routines return to indicate that the block being searched does not exist, so functions that operate on that b-tree may misbehave. When nilfs_btree_insert() receives this spurious 'not found' code from nilfs_btree_do_lookup(), it misunderstands that the 'not found' check was successful and continues the insert operation using incomplete lookup path data, causing the following crash: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] ... RIP: 0010:nilfs_btree_get_nonroot_node fs/nilfs2/btree.c:418 [inline] RIP: 0010:nilfs_btree_prepare_insert fs/nilfs2/btree.c:1077 [inline] RIP: 0010:nilfs_btree_insert+0x6d3/0x1c10 fs/nilfs2/btree.c:1238 Code: bc 24 80 00 00 00 4c 89 f8 48 c1 e8 03 42 80 3c 28 00 74 08 4c 89 ff e8 4b 02 92 fe 4d 8b 3f 49 83 c7 28 4c 89 f8 48 c1 e8 03 <42> 80 3c 28 00 74 08 4c 89 ff e8 2e 02 92 fe 4d 8b 3f 49 83 c7 02 ... Call Trace: nilfs_bmap_do_insert fs/nilfs2/bmap.c:121 [inline] nilfs_bmap_insert+0x20d/0x360 fs/nilfs2/bmap.c:147 nilfs_get_block+0x414/0x8d0 fs/nilfs2/inode.c:101 __block_write_begin_int+0x54c/0x1a80 fs/buffer.c:1991 __block_write_begin fs/buffer.c:2041 [inline] block_write_begin+0x93/0x1e0 fs/buffer.c:2102 nilfs_write_begin+0x9c/0x110 fs/nilfs2/inode.c:261 generic_perform_write+0x2e4/0x5e0 mm/filemap.c:3772 __generic_file_write_iter+0x176/0x400 mm/filemap.c:3900 generic_file_write_iter+0xab/0x310 mm/filemap.c:3932 call_write_iter include/linux/fs.h:2186 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x7dc/0xc50 fs/read_write.c:584 ksys_write+0x177/0x2a0 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd ... This patch fixes the root cause of this problem by replacing the error code that __nilfs_btree_get_block() returns on block address conversion failure from -ENOENT to another internal code -EINVAL which means that the b-tree metadata is corrupted. By returning -EINVAL, it propagates without glitches, and for all relevant b-tree operations, functions in the upper bmap layer output an error message indicating corrupted b-tree metadata via nilfs_bmap_convert_error(), and code -EIO will be eventually returned as it should be. Link: https://lkml.kernel.org/r/000000000000bd89e205f0e38355@google.com Link: https://lkml.kernel.org/r/20230105055356.8811-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+ede796cecd5296353515@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/btree.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b9d15c3df3cc..40ce92a332fe 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -480,9 +480,18 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh, &submit_ptr); if (ret) { - if (ret != -EEXIST) - return ret; - goto out_check; + if (likely(ret == -EEXIST)) + goto out_check; + if (ret == -ENOENT) { + /* + * Block address translation failed due to invalid + * value of 'ptr'. In this case, return internal code + * -EINVAL (broken bmap) to notify bmap layer of fatal + * metadata corruption. + */ + ret = -EINVAL; + } + return ret; } if (ra) { -- cgit v1.2.3