From 35d7bdc86031a2c1ae05ac27dfa93b2acdcbaecc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:20:25 +0200 Subject: kernel/fork: factor out replacing the current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's factor the main logic out into replace_mm_exe_file(), such that all mm->exe_file logic is contained in kernel/fork.c. While at it, perform some simple cleanups that are possible now that we're simplifying the individual functions. Acked-by: Christian Brauner Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..48c6fa9ab792 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2581,6 +2581,7 @@ extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -- cgit v1.2.3 From fe69d560b5bd9ec77b5d5749bd7027344daef47e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:29:59 +0200 Subject: kernel/fork: always deny write access to current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to remove VM_DENYWRITE only currently only used when mapping the executable during exec. During exec, we already deny_write_access() the executable, however, after exec completes the VMAs mapped with VM_DENYWRITE effectively keeps write access denied via deny_write_access(). Let's deny write access when setting or replacing the MM exe_file. With this change, we can remove VM_DENYWRITE for mapping executables. Make set_mm_exe_file() return an error in case deny_write_access() fails; note that this should never happen, because exec code does a deny_write_access() early and keeps write access denied when calling set_mm_exe_file. However, it makes the code easier to read and makes set_mm_exe_file() and replace_mm_exe_file() look more similar. This represents a minor user space visible change: sys_prctl(PR_SET_MM_MAP/EXE_FILE) can now fail if the file is already opened writable. Also, after sys_prctl(PR_SET_MM_MAP/EXE_FILE) the file cannot be opened writable. Note that we can already fail with -EACCES if the file doesn't have execute permissions. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- fs/exec.c | 4 +++- include/linux/mm.h | 2 +- kernel/fork.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 48 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index 38f63451b928..9294049f5487 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1270,7 +1270,9 @@ int begin_new_exec(struct linux_binprm * bprm) * not visibile until then. This also enables the update * to be lockless. */ - set_mm_exe_file(bprm->mm, bprm->file); + retval = set_mm_exe_file(bprm->mm, bprm->file); + if (retval) + goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); diff --git a/include/linux/mm.h b/include/linux/mm.h index 48c6fa9ab792..56b1cd41db61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2580,7 +2580,7 @@ static inline int check_data_rlimit(unsigned long rlim, extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index f4ac883c4a1e..7677f897ecb6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -470,6 +470,20 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +{ + struct file *exe_file; + + exe_file = get_mm_exe_file(oldmm); + RCU_INIT_POINTER(mm->exe_file, exe_file); + /* + * We depend on the oldmm having properly denied write access to the + * exe_file already. + */ + if (exe_file && deny_write_access(exe_file)) + pr_warn_once("deny_write_access() failed in %s\n", __func__); +} + #ifdef CONFIG_MMU static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) @@ -493,7 +507,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; @@ -639,7 +653,7 @@ static inline void mm_free_pgd(struct mm_struct *mm) static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { mmap_write_lock(oldmm); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mmap_write_unlock(oldmm); return 0; } @@ -1149,8 +1163,10 @@ void mmput_async(struct mm_struct *mm) * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve task is single * threaded. + * + * Can only fail if new_exe_file != NULL. */ -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; @@ -1161,11 +1177,21 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) + if (new_exe_file) { + /* + * We expect the caller (i.e., sys_execve) to already denied + * write access, so this is unlikely to fail. + */ + if (unlikely(deny_write_access(new_exe_file))) + return -EACCES; get_file(new_exe_file); + } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + allow_write_access(old_exe_file); fput(old_exe_file); + } + return 0; } /** @@ -1201,10 +1227,22 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) } /* set the new file, lockless */ + ret = deny_write_access(new_exe_file); + if (ret) + return -EACCES; get_file(new_exe_file); + old_exe_file = xchg(&mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + /* + * Don't race with dup_mmap() getting the file and disallowing + * write access while someone might open the file writable. + */ + mmap_read_lock(mm); + allow_write_access(old_exe_file); fput(old_exe_file); + mmap_read_unlock(mm); + } return 0; } -- cgit v1.2.3 From 8d0920bde5eb8ec7e567939b85e65a0596c8580d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 22 Apr 2021 12:08:20 +0200 Subject: mm: remove VM_DENYWRITE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All in-tree users of MAP_DENYWRITE are gone. MAP_DENYWRITE cannot be set from user space, so all users are gone; let's remove it. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- fs/proc/task_mmu.c | 1 - include/linux/mm.h | 1 - include/linux/mman.h | 1 - include/trace/events/mmflags.h | 1 - kernel/events/core.c | 2 -- kernel/fork.c | 3 --- lib/test_printf.c | 5 ++--- mm/mmap.c | 27 +++------------------------ 8 files changed, 5 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index eb97468dfe4c..cf25be3e0321 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -619,7 +619,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", - [ilog2(VM_DENYWRITE)] = "dw", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", diff --git a/include/linux/mm.h b/include/linux/mm.h index 56b1cd41db61..257995f62e83 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,7 +281,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 diff --git a/include/linux/mman.h b/include/linux/mman.h index ebb09a964272..bd9aadda047b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -153,7 +153,6 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | - _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | arch_calc_vm_flag_bits(flags); diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index f160484afc5c..0b53e855c4ac 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -165,7 +165,6 @@ IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison") {VM_UFFD_MISSING, "uffd_missing" }, \ IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ - {VM_DENYWRITE, "denywrite" }, \ {VM_UFFD_WP, "uffd_wp" }, \ {VM_LOCKED, "locked" }, \ {VM_IO, "io" }, \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cb1f9b8392e..19767bb9933c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8307,8 +8307,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) else flags = MAP_PRIVATE; - if (vma->vm_flags & VM_DENYWRITE) - flags |= MAP_DENYWRITE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) diff --git a/kernel/fork.c b/kernel/fork.c index 7677f897ecb6..feef1057081d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -570,12 +570,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); file = tmp->vm_file; if (file) { - struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); diff --git a/lib/test_printf.c b/lib/test_printf.c index 8ac71aee46af..8a48b61c3763 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -675,9 +675,8 @@ flags(void) "uptodate|dirty|lru|active|swapbacked", cmp_buffer); - flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC - | VM_DENYWRITE; - test("read|exec|mayread|maywrite|mayexec|denywrite", "%pGv", &flags); + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + test("read|exec|mayread|maywrite|mayexec", "%pGv", &flags); gfp = GFP_TRANSHUGE; test("GFP_TRANSHUGE", "%pGg", &gfp); diff --git a/mm/mmap.c b/mm/mmap.c index ca54d36d203a..589dc1dc13db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -148,8 +148,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { - if (vma->vm_flags & VM_DENYWRITE) - allow_write_access(file); if (vma->vm_flags & VM_SHARED) mapping_unmap_writable(mapping); @@ -666,8 +664,6 @@ static void __vma_link_file(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; - if (vma->vm_flags & VM_DENYWRITE) - put_write_access(file_inode(file)); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(mapping); @@ -1788,22 +1784,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma->vm_pgoff = pgoff; if (file) { - if (vm_flags & VM_DENYWRITE) { - error = deny_write_access(file); - if (error) - goto free_vma; - } if (vm_flags & VM_SHARED) { error = mapping_map_writable(file->f_mapping); if (error) - goto allow_write_and_free_vma; + goto free_vma; } - /* ->mmap() can change vma->vm_file, but must guarantee that - * vma_link() below can deny write-access if VM_DENYWRITE is set - * and map writably if VM_SHARED is set. This usually means the - * new file must not have been exposed to user-space, yet. - */ vma->vm_file = get_file(file); error = call_mmap(file, vma); if (error) @@ -1860,13 +1846,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (file) { unmap_writable: - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); - } + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); file = vma->vm_file; out: perf_event_mmap(vma); @@ -1906,9 +1888,6 @@ unmap_and_free_vma: charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); -allow_write_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); free_vma: vm_area_free(vma); unacct_error: -- cgit v1.2.3 From 6128b3af2a5e42386aa7faf37609b57f39fb7d00 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 09:38:14 +0200 Subject: mm: ignore MAP_DENYWRITE in ksys_mmap_pgoff() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's also remove masking off MAP_DENYWRITE from ksys_mmap_pgoff(): the last in-tree occurrence of MAP_DENYWRITE is now in LEGACY_MAP_MASK, which accepts the flag e.g., for MAP_SHARED_VALIDATE; however, the flag is ignored throughout the kernel now. Add a comment to LEGACY_MAP_MASK stating that MAP_DENYWRITE is ignored. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mman.h | 3 ++- mm/mmap.c | 2 -- mm/nommu.c | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mman.h b/include/linux/mman.h index bd9aadda047b..b66e91b8176c 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -32,7 +32,8 @@ * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. * - * MAP_EXECUTABLE is completely ignored throughout the kernel. + * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the + * kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ diff --git a/mm/mmap.c b/mm/mmap.c index 589dc1dc13db..bf11fc6e8311 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1626,8 +1626,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, return PTR_ERR(file); } - flags &= ~MAP_DENYWRITE; - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) diff --git a/mm/nommu.c b/mm/nommu.c index 3a93d4054810..0987d131bdfc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1296,8 +1296,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, goto out; } - flags &= ~MAP_DENYWRITE; - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) -- cgit v1.2.3 From 592ca09be8333bd226f50100328a905bfc377133 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 09:45:45 +0200 Subject: fs: update documentation of get_write_access() and friends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As VM_DENYWRITE does no longer exists, let's spring-clean the documentation of get_write_access() and friends. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/fs.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..e0dc3e96ed72 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3055,15 +3055,20 @@ static inline void file_end_write(struct file *file) } /* + * This is used for regular files where some users -- especially the + * currently executed binary in a process, previously handled via + * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap + * read-write shared) accesses. + * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. - * This is used for regular files. - * We cannot support write (and maybe mmap read-write shared) accesses and - * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode - * can have the following values: - * 0: no writers, no VM_DENYWRITE mappings - * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist - * > 0: (i_writecount) users are writing to the file. + * deny_write_access() denies write access to a file. + * allow_write_access() re-enables write access to a file. + * + * The i_writecount field of an inode can have the following values: + * 0: no write access, no denied write access + * < 0: (-i_writecount) users that denied write access to the file. + * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to -- cgit v1.2.3