diff options
Diffstat (limited to 'mm/mmap.c')
-rw-r--r-- | mm/mmap.c | 142 |
1 files changed, 90 insertions, 52 deletions
diff --git a/mm/mmap.c b/mm/mmap.c index f609e9ec4a25..bdd19f5b994e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -132,7 +132,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } - /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ + /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool downgraded = false; LIST_HEAD(uf); - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; @@ -238,14 +238,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_sem to read. + * __do_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; /* - * mm->brk must to be protected by write mmap_sem so update it - * before downgrading mmap_sem. When __do_munmap() fails, + * mm->brk must to be protected by write mmap_lock so update it + * before downgrading mmap_lock. When __do_munmap() fails, * mm->brk will be restored from origbrk. */ mm->brk = brk; @@ -272,9 +272,9 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; if (downgraded) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); else - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); @@ -282,7 +282,7 @@ success: out: retval = origbrk; - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return retval; } @@ -505,7 +505,7 @@ static __always_inline void vma_rb_erase(struct vm_area_struct *vma, * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * - * The entire update must be protected by exclusive mmap_sem and by + * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static inline void @@ -1030,7 +1030,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache - * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which + * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. */ static int @@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } /* - * Rough compatbility check to quickly see if it's even worth looking + * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ @@ -1361,15 +1361,15 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, } /* - * The caller must hold down_write(¤t->mm->mmap_sem). + * The caller must write-lock current->mm->mmap_lock. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate, - struct list_head *uf) + unsigned long flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; + vm_flags_t vm_flags; int pkey = 0; *populate = 0; @@ -1431,7 +1431,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) @@ -1562,11 +1562,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, file = fget(fd); if (!file) return -EBADF; - if (is_file_hugepages(file)) + if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); - retval = -EINVAL; - if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) + } else if (unlikely(flags & MAP_HUGETLB)) { + retval = -EINVAL; goto out_fput; + } } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; struct hstate *hs; @@ -1689,7 +1690,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; + struct vm_area_struct *vma, *prev, *merge; int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; @@ -1773,6 +1774,29 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (error) goto unmap_and_free_vma; + /* If vm_flags changed after call_mmap(), we should try merge vma again + * as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && prev)) { + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + if (merge) { + /* ->mmap() can change vma->vm_file and fput the original file. So + * fput the vma->vm_file here or we would add an extra fput for file + * and cause general protection fault ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags and possible addr to pick up the change. We don't + * warn here if addr changed as the vma is not linked by vma_link(). + */ + addr = vma->vm_start; + vm_flags = vma->vm_flags; + goto unmap_writable; + } + } + /* Can addr have changed?? * * Answer: Yes, several device drivers can do it in their @@ -1795,6 +1819,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ if (file) { +unmap_writable: if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); if (vm_flags & VM_DENYWRITE) @@ -2209,7 +2234,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. - * do_mmap_pgoff() will clear pgoff, so match alignment. + * do_mmap() will clear pgoff, so match alignment. */ pgoff = 0; get_area = shmem_get_unmapped_area; @@ -2371,7 +2396,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the + * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ anon_vma_lock_write(vma->anon_vma); @@ -2389,7 +2414,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (!error) { /* * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_sem + * updates, but we only hold a shared mmap_lock * lock here, so we need to protect against * concurrent vma expansions. * anon_vma_lock_write() doesn't help here, as @@ -2451,7 +2476,7 @@ int expand_downwards(struct vm_area_struct *vma, /* * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the + * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ anon_vma_lock_write(vma->anon_vma); @@ -2469,7 +2494,7 @@ int expand_downwards(struct vm_area_struct *vma, if (!error) { /* * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_sem + * updates, but we only hold a shared mmap_lock * lock here, so we need to protect against * concurrent vma expansions. * anon_vma_lock_write() doesn't help here, as @@ -2620,7 +2645,7 @@ static void unmap_region(struct mm_struct *mm, * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. */ -static void +static bool detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long end) { @@ -2645,6 +2670,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, /* Kill the cache */ vmacache_invalidate(mm); + + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (vma && (vma->vm_flags & VM_GROWSDOWN)) + return false; + if (prev && (prev->vm_flags & VM_GROWSUP)) + return false; + return true; } /* @@ -2825,10 +2861,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } /* Detach vmas from rbtree */ - detach_vmas_to_be_unmapped(mm, vma, prev, end); + if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) + downgrade = false; if (downgrade) - downgrade_write(&mm->mmap_sem); + mmap_write_downgrade(mm); unmap_region(mm, vma, prev, start, end); @@ -2850,20 +2887,20 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) struct mm_struct *mm = current->mm; LIST_HEAD(uf); - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; ret = __do_munmap(mm, start, len, &uf, downgrade); /* - * Returning 1 indicates mmap_sem is downgraded. + * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset * it to 0 before return. */ if (ret == 1) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); ret = 0; } else - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; @@ -2911,7 +2948,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; vma = find_vma(mm, start); @@ -2970,11 +3007,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } file = get_file(vma->vm_file); - ret = do_mmap_pgoff(vma->vm_file, start, size, + ret = do_mmap(vma->vm_file, start, size, prot, flags, pgoff, &populate, NULL); fput(file); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) @@ -3074,12 +3111,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (!len) return 0; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); @@ -3107,12 +3144,12 @@ void exit_mmap(struct mm_struct *mm) /* * Manually reap the mm to free as much memory as possible. * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_sem for + * this mm from further consideration. Taking mm->mmap_lock for * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_sem is + * reaper will not run on this mm again after mmap_lock is * dropped. * - * Nothing can be holding mm->mmap_sem here and the above call + * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. * @@ -3123,8 +3160,8 @@ void exit_mmap(struct mm_struct *mm) (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); + mmap_write_lock(mm); + mmap_write_unlock(mm); } if (mm->locked_vm) { @@ -3159,6 +3196,7 @@ void exit_mmap(struct mm_struct *mm) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); + cond_resched(); } vm_unacct_memory(nr_accounted); } @@ -3189,7 +3227,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap_pgoff and in do_brk. + * Similarly in do_mmap and in do_brk. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); @@ -3437,7 +3475,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma, } /* - * Called with mm->mmap_sem held for writing. + * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. @@ -3474,7 +3512,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares @@ -3504,7 +3542,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } @@ -3513,11 +3551,11 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * - * The caller must take the mmap_sem in write mode before calling + * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the - * mmap_sem until mm_drop_all_locks() returns. + * mmap_lock until mm_drop_all_locks() returns. * - * mmap_sem in write mode is required in order to block all operations + * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. @@ -3550,7 +3588,7 @@ int mm_take_all_locks(struct mm_struct *mm) struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(mmap_read_trylock(mm)); mutex_lock(&mm_all_locks_mutex); @@ -3622,7 +3660,7 @@ static void vm_unlock_mapping(struct address_space *mapping) } /* - * The mmap_sem cannot be released by the caller until + * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) @@ -3630,7 +3668,7 @@ void mm_drop_all_locks(struct mm_struct *mm) struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(mmap_read_trylock(mm)); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { |