Diffstat (limited to 'mm/mmap.c')
 mm/mmap.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 193 insertions(+), 45 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 7a0707a48047..1af87c14183d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -116,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
 void vma_set_page_prot(struct vm_area_struct *vma)
 {
 	unsigned long vm_flags = vma->vm_flags;
+	pgprot_t vm_page_prot;
 
-	vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
-	if (vma_wants_writenotify(vma)) {
+	vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
+	if (vma_wants_writenotify(vma, vm_page_prot)) {
 		vm_flags &= ~VM_SHARED;
-		vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
-						     vm_flags);
+		vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
 	}
+	/* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
+	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
 }
 
 /*
@@ -400,15 +402,9 @@ static inline void vma_rb_insert(struct vm_area_struct *vma,
 	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
 {
 	/*
-	 * All rb_subtree_gap values must be consistent prior to erase,
-	 * with the possible exception of the vma being erased.
-	 */
-	validate_mm_rb(root, vma);
-
-	/*
 	 * Note rb_erase_augmented is a fairly large inline function,
 	 * so make sure we instantiate it only once with our desired
 	 * augmented rbtree callbacks.
@@ -416,6 +412,32 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
 	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
+static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
+						struct rb_root *root,
+						struct vm_area_struct *ignore)
+{
+	/*
+	 * All rb_subtree_gap values must be consistent prior to erase,
+	 * with the possible exception of the "next" vma being erased if
+	 * next->vm_start was reduced.
+	 */
+	validate_mm_rb(root, ignore);
+
+	__vma_rb_erase(vma, root);
+}
+
+static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
+					 struct rb_root *root)
+{
+	/*
+	 * All rb_subtree_gap values must be consistent prior to erase,
+	 * with the possible exception of the vma being erased.
+	 */
+	validate_mm_rb(root, vma);
+
+	__vma_rb_erase(vma, root);
+}
+
 /*
  * vma has some anon_vma assigned, and is already inserted on that
  * anon_vma's interval trees.
@@ -599,14 +621,25 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 	mm->map_count++;
 }
 
-static inline void
-__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
-		struct vm_area_struct *prev)
+static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+						struct vm_area_struct *vma,
+						struct vm_area_struct *prev,
+						bool has_prev,
+						struct vm_area_struct *ignore)
 {
 	struct vm_area_struct *next;
 
-	vma_rb_erase(vma, &mm->mm_rb);
-	prev->vm_next = next = vma->vm_next;
+	vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
+	next = vma->vm_next;
+	if (has_prev)
+		prev->vm_next = next;
+	else {
+		prev = vma->vm_prev;
+		if (prev)
+			prev->vm_next = next;
+		else
+			mm->mmap = next;
+	}
 	if (next)
 		next->vm_prev = prev;
 
@@ -614,6 +647,13 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 	vmacache_invalidate(mm);
 }
 
+static inline void __vma_unlink_prev(struct mm_struct *mm,
+				     struct vm_area_struct *vma,
+				     struct vm_area_struct *prev)
+{
+	__vma_unlink_common(mm, vma, prev, true, vma);
+}
+
 /*
  * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  * is already present in an i_mmap tree without adjusting the tree.
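
The vma_set_page_prot() hunk above is the heart of the fix: the new protection value is computed in a local pgprot_t and published with a single WRITE_ONCE(), so a reader holding only the rmap locks, not mmap_sem, can never observe the intermediate value the old code stored between its two vm_pgprot_modify() calls. Below is a minimal sketch of the publish/consume pairing; the helper names are illustrative (the reader stands in for the remove_protection_ptes path named in the patch comment), not functions added by this diff.

#include <linux/mm.h>

/*
 * Writer: build the final value in private storage, then publish it
 * with one store. Concurrent readers see either the old pgprot or the
 * new one, never a half-updated intermediate.
 */
static void publish_page_prot(struct vm_area_struct *vma, pgprot_t prot)
{
	WRITE_ONCE(vma->vm_page_prot, prot);	/* single publication point */
}

/* Reader: runs without mmap_sem, like remove_protection_ptes. */
static pgprot_t read_page_prot(struct vm_area_struct *vma)
{
	return READ_ONCE(vma->vm_page_prot);	/* pairs with the store above */
}
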
@@ -621,11 +661,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  * are necessary. The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+	struct vm_area_struct *expand)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *next = vma->vm_next;
+	struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
 	struct address_space *mapping = NULL;
 	struct rb_root *root = NULL;
 	struct anon_vma *anon_vma = NULL;
@@ -641,9 +682,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		/*
 		 * vma expands, overlapping all the next, and
 		 * perhaps the one after too (mprotect case 6).
+		 * The only other cases that get here are
+		 * case 1, case 7 and case 8.
 		 */
-		remove_next = 1 + (end > next->vm_end);
-		end = next->vm_end;
+		if (next == expand) {
+			/*
+			 * The only case where we don't expand "vma"
+			 * and we expand "next" instead is case 8.
+			 */
+			VM_WARN_ON(end != next->vm_end);
+			/*
+			 * remove_next == 3 means we're
+			 * removing "vma" and that to do so we
+			 * swapped "vma" and "next".
+			 */
+			remove_next = 3;
+			VM_WARN_ON(file != next->vm_file);
+			swap(vma, next);
+		} else {
+			VM_WARN_ON(expand != vma);
+			/*
+			 * case 1, 6, 7, remove_next == 2 is case 6,
+			 * remove_next == 1 is case 1 or 7.
+			 */
+			remove_next = 1 + (end > next->vm_end);
+			VM_WARN_ON(remove_next == 2 &&
+				   end != next->vm_next->vm_end);
+			VM_WARN_ON(remove_next == 1 &&
+				   end != next->vm_end);
+			/* trim end to next, for case 6 first pass */
+			end = next->vm_end;
+		}
+
 		exporter = next;
 		importer = vma;
 
@@ -651,7 +721,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		 * If next doesn't have anon_vma, import from vma after
 		 * next, if the vma overlaps with it.
 		 */
-		if (remove_next == 2 && next && !next->anon_vma)
+		if (remove_next == 2 && !next->anon_vma)
 			exporter = next->vm_next;
 
 	} else if (end > next->vm_start) {
@@ -662,6 +732,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
 		exporter = next;
 		importer = vma;
+		VM_WARN_ON(expand != importer);
 	} else if (end < vma->vm_end) {
 		/*
 		 * vma shrinks, and !insert tells it's not
@@ -671,6 +742,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
 		exporter = vma;
 		importer = next;
+		VM_WARN_ON(expand != importer);
 	}
 
 	/*
@@ -688,7 +760,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		}
 	}
 again:
-	vma_adjust_trans_huge(vma, start, end, adjust_next);
+	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
 
 	if (file) {
 		mapping = file->f_mapping;
@@ -714,8 +786,8 @@ again:
 	if (!anon_vma && adjust_next)
 		anon_vma = next->anon_vma;
 	if (anon_vma) {
-		VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
-			  anon_vma != next->anon_vma, next);
+		VM_WARN_ON(adjust_next && next->anon_vma &&
+			   anon_vma != next->anon_vma);
 		anon_vma_lock_write(anon_vma);
 		anon_vma_interval_tree_pre_update_vma(vma);
 		if (adjust_next)
@@ -755,7 +827,19 @@ again:
 		 * vma_merge has merged next into vma, and needs
 		 * us to remove next before dropping the locks.
 		 */
-		__vma_unlink(mm, next, vma);
+		if (remove_next != 3)
+			__vma_unlink_prev(mm, next, vma);
+		else
+			/*
+			 * vma is not before next if they've been
+			 * swapped.
+			 *
+			 * pre-swap() next->vm_start was reduced so
+			 * tell validate_mm_rb to ignore pre-swap()
+			 * "next" (which is stored in post-swap()
+			 * "vma").
+			 */
+			__vma_unlink_common(mm, next, NULL, false, vma);
 		if (file)
 			__remove_shared_vm_struct(next, file, mapping);
 	} else if (insert) {
@@ -807,7 +891,27 @@ again:
 		 * we must remove another next too. It would clutter
 		 * up the code too much to do both in one go.
 		 */
-		next = vma->vm_next;
+		if (remove_next != 3) {
+			/*
+			 * If "next" was removed and vma->vm_end was
+			 * expanded (up) over it, in turn
+			 * "next->vm_prev->vm_end" changed and the
+			 * "vma->vm_next" gap must be updated.
+			 */
+			next = vma->vm_next;
+		} else {
+			/*
+			 * For the scope of the comment "next" and
+			 * "vma" considered pre-swap(): if "vma" was
+			 * removed, next->vm_start was expanded (down)
+			 * over it and the "next" gap must be updated.
+			 * Because of the swap() the post-swap() "vma"
+			 * actually points to pre-swap() "next"
+			 * (post-swap() "next" as opposed is now a
+			 * dangling pointer).
+			 */
+			next = vma;
+		}
 		if (remove_next == 2) {
 			remove_next = 1;
 			end = next->vm_end;
@@ -815,8 +919,28 @@ again:
 		}
 		else if (next)
 			vma_gap_update(next);
-		else
-			mm->highest_vm_end = end;
+		else {
+			/*
+			 * If remove_next == 2 we obviously can't
+			 * reach this path.
+			 *
+			 * If remove_next == 3 we can't reach this
+			 * path because pre-swap() next is always not
+			 * NULL. pre-swap() "next" is not being
+			 * removed and its next->vm_end is not altered
+			 * (and furthermore "end" already matches
+			 * next->vm_end in remove_next == 3).
+			 *
+			 * We reach this only in the remove_next == 1
+			 * case if the "next" vma that was removed was
+			 * the highest vma of the mm. However in such
+			 * case next->vm_end == "end" and the extended
+			 * "vma" has vma->vm_end == next->vm_end so
+			 * mm->highest_vm_end doesn't need any update
+			 * in remove_next == 1 case.
+			 */
+			VM_WARN_ON(mm->highest_vm_end != end);
+		}
 	}
 	if (insert && file)
 		uprobe_mmap(insert);
@@ -936,13 +1060,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  *    cannot merge    might become    might become    might become
  *    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
- *    mremap move:                                    PPPPNNNNNNNN 8
+ *    mremap move:                                    PPPPXXXXXXXX 8
  *        AAAA
  *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  *    might become    case 1 below    case 2 below    case 3 below
 *
- * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
- * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ * It is important for case 8 that the vma NNNN overlapping the
+ * region AAAA is never going to be extended over XXXX. Instead XXXX
+ * must be extended in region AAAA and NNNN must be removed. This way
+ * in all cases where vma_merge succeeds, the moment vma_adjust drops
+ * the rmap_locks, the properties of the merged vma will be already
+ * correct for the whole merged range. Some of those properties like
+ * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
+ * be correct for the whole merged range immediately after the
+ * rmap_locks are released. Otherwise if XXXX would be removed and
+ * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * or other rmap walkers (if working on addresses beyond the "end"
+ * parameter) may establish ptes with the wrong permissions of NNNN
+ * instead of the right permissions of XXXX.
  */
 struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
@@ -967,9 +1102,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	else
 		next = mm->mmap;
 	area = next;
-	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
+	if (area && area->vm_end == end)		/* cases 6, 7, 8 */
 		next = next->vm_next;
 
+	/* verify some invariant that must be enforced by the caller */
+	VM_WARN_ON(prev && addr <= prev->vm_start);
+	VM_WARN_ON(area && end > area->vm_end);
+	VM_WARN_ON(addr >= end);
+
 	/*
 	 * Can it merge with the predecessor?
 	 */
@@ -990,11 +1130,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 				is_mergeable_anon_vma(prev->anon_vma,
 						      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
-			err = vma_adjust(prev, prev->vm_start,
-				next->vm_end, prev->vm_pgoff, NULL);
+			err = __vma_adjust(prev, prev->vm_start,
+					 next->vm_end, prev->vm_pgoff, NULL,
+					 prev);
 		} else					/* cases 2, 5, 7 */
-			err = vma_adjust(prev, prev->vm_start,
-				end, prev->vm_pgoff, NULL);
+			err = __vma_adjust(prev, prev->vm_start,
+					 end, prev->vm_pgoff, NULL, prev);
 		if (err)
 			return NULL;
 		khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1010,11 +1151,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			     anon_vma, file, pgoff+pglen,
 			     vm_userfaultfd_ctx)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
-			err = vma_adjust(prev, prev->vm_start,
-				addr, prev->vm_pgoff, NULL);
-		else					/* cases 3, 8 */
-			err = vma_adjust(area, addr, next->vm_end,
-				next->vm_pgoff - pglen, NULL);
+			err = __vma_adjust(prev, prev->vm_start,
+					 addr, prev->vm_pgoff, NULL, next);
+		else {					/* cases 3, 8 */
+			err = __vma_adjust(area, addr, next->vm_end,
+					 next->vm_pgoff - pglen, NULL, next);
+			/*
+			 * In case 3 area is already equal to next and
+			 * this is a noop, but in case 8 "area" has
+			 * been removed and next was expanded over it.
+			 */
+			area = next;
+		}
 		if (err)
 			return NULL;
 		khugepaged_enter_vma_merge(area, vm_flags);
@@ -1386,7 +1534,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  * to the private version (using protection_map[] without the
  * VM_SHARED bit).
  */
-int vma_wants_writenotify(struct vm_area_struct *vma)
+int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
 {
 	vm_flags_t vm_flags = vma->vm_flags;
 	const struct vm_operations_struct *vm_ops = vma->vm_ops;
@@ -1401,8 +1549,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 
 	/* The open routine did something to the protections that pgprot_modify
 	 * won't preserve? */
-	if (pgprot_val(vma->vm_page_prot) !=
-	    pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
+	if (pgprot_val(vm_page_prot) !=
+	    pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
 		return 0;
 
 	/* Do we need to track softdirty? */
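
Note that vma_adjust() does not disappear for the other callers: only vma_merge() needs the new "expand" argument, and since this diffstat is limited to mm/mmap.c the header side of the rename is not visible here. In the full commit the old entry point survives in include/linux/mm.h as a thin wrapper passing expand == NULL, roughly as sketched below (the exact header placement is taken from the full commit, not from this excerpt).

/*
 * Sketch of the compatibility wrapper (header change not shown in
 * this mm/mmap.c-only diff): expand == NULL keeps the historical
 * behaviour, while vma_merge() passes whichever vma has to grow,
 * "prev" for cases 1, 2, 5, 6 and 7, "next" for cases 3, 4 and 8.
 */
extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
	struct vm_area_struct *expand);

static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
	/* expand == NULL: legacy behaviour, no vma/next swap */
	return __vma_adjust(vma, start, end, pgoff, insert, NULL);
}

In case 8 the expand argument is what selects the remove_next == 3 path above: __vma_adjust() swaps "vma" and "next", so the old NNNN vma is the one removed and XXXX is extended down over the AAAA range, which keeps vm_flags/vm_page_prot correct for rmap walkers from the moment the rmap locks are dropped.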