Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   469
1 files changed, 283 insertions, 186 deletions
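
The bulk of this change splits the old do_wp_page() into four helpers: wp_page_reuse(), wp_page_copy(), wp_pfn_shared() (which now calls a vm_ops->pfn_mkwrite handler for shared VM_PFNMAP/VM_MIXEDMAP mappings) and wp_page_shared(). It also makes do_anonymous_page() and do_fault() return VM_FAULT_SIGBUS for file mappings that lack the expected vm_ops, switches do_fault_around() to READ_ONCE(), and turns might_fault() into __might_fault(). As a rough userspace illustration (not part of the patch; the scratch-file path below is arbitrary), the two write-fault flavours the refactoring separates can be provoked like this: writing through a MAP_PRIVATE file mapping is typically resolved by copying the page (the wp_page_copy() path), while writing through a MAP_SHARED mapping reuses the page-cache page and dirties the file (the wp_page_shared() path).

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/wp_demo";          /* illustrative scratch file */
	int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0 || ftruncate(fd, 4096) < 0) { perror("setup"); return 1; }

	char *priv = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	char *shar = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (priv == MAP_FAILED || shar == MAP_FAILED) { perror("mmap"); return 1; }

	volatile char sink;
	sink = priv[0];  /* map the page-cache page read-only first...              */
	priv[0] = 'p';   /* ...so this write is a wp fault, resolved by copying (COW) */
	sink = shar[0];
	shar[0] = 's';   /* wp fault on a shared page: page is reused and marked dirty */
	(void)sink;

	char byte = 0;
	if (pread(fd, &byte, 1, 0) != 1) { perror("pread"); return 1; }
	printf("file now starts with '%c': only the MAP_SHARED write reached it\n", byte);

	munmap(priv, 4096);
	munmap(shar, 4096);
	close(fd);
	unlink(path);
	return 0;
}
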
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..388dcf9aa283 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
- if (vma->vm_ops)
- printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
- vma->vm_ops->fault);
- if (vma->vm_file)
- printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
- vma->vm_file->f_op->mmap);
+ pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+ vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->fault : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ mapping ? mapping->a_ops->readpage : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
@@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
- *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Handle write page faults for pages that can be reused in the current vma
*
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ struct page *page, int page_mkwrite,
+ int dirty_shared)
__releases(ptl)
{
- struct page *old_page, *new_page = NULL;
pte_t entry;
- int ret = 0;
- int page_mkwrite = 0;
- bool dirty_shared = false;
- unsigned long mmun_start = 0; /* For mmu_notifiers */
- unsigned long mmun_end = 0; /* For mmu_notifiers */
- struct mem_cgroup *memcg;
-
- old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page) {
- /*
- * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
- *
- * We should not cow pages in a shared writeable mapping.
- * Just mark the pages writable as we can't do any dirty
- * accounting on raw pfn maps.
- */
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
- goto reuse;
- goto gotten;
- }
-
/*
- * Take out anonymous pages first, anonymous shared vmas are
- * not dirty accountable.
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
- if (!trylock_page(old_page)) {
- page_cache_get(old_page);
- pte_unmap_unlock(page_table, ptl);
- lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_cache_release(old_page);
- }
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
- unlock_page(old_page);
- goto reuse;
- }
- unlock_page(old_page);
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))) {
- page_cache_get(old_page);
- /*
- * Only catch write-faults on shared writable pages,
- * read-only shared pages can get COWed by
- * get_user_pages(.write=1, .force=1).
- */
- if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- int tmp;
-
- pte_unmap_unlock(page_table, ptl);
- tmp = do_page_mkwrite(vma, old_page, address);
- if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
- return tmp;
- }
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_mkwrite = 1;
- }
-
- dirty_shared = true;
-
-reuse:
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (old_page)
- page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
- flush_cache_page(vma, address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, address, page_table, entry,1))
- update_mmu_cache(vma, address, page_table);
- pte_unmap_unlock(page_table, ptl);
- ret |= VM_FAULT_WRITE;
+ if (page)
+ page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+ update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
- if (!page_mkwrite)
- lock_page(old_page);
+ if (dirty_shared) {
+ struct address_space *mapping;
+ int dirtied;
- dirtied = set_page_dirty(old_page);
- VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
- mapping = old_page->mapping;
- unlock_page(old_page);
- page_cache_release(old_page);
+ if (!page_mkwrite)
+ lock_page(page);
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ mapping = page->mapping;
+ unlock_page(page);
+ page_cache_release(page);
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
}
- return ret;
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
}
- /*
- * Ok, we need to copy. Oh, well..
- */
- page_cache_get(old_page);
-gotten:
- pte_unmap_unlock(page_table, ptl);
+ return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ * relevant references. This includes dropping the reference the page-table
+ * held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ pte_t orig_pte, struct page *old_page)
+{
+ struct page *new_page = NULL;
+ spinlock_t *ptl = NULL;
+ pte_t entry;
+ int page_copied = 0;
+ const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
+ const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
+ struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2158,13 +2081,12 @@ gotten:
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
- __SetPageUptodate(new_page);
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
- mmun_start = address & PAGE_MASK;
- mmun_end = mmun_start + PAGE_SIZE;
+ __SetPageUptodate(new_page);
+
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
@@ -2177,8 +2099,9 @@ gotten:
dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- } else
+ } else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2150,29 @@ gotten:
/* Free the old page.. */
new_page = old_page;
- ret |= VM_FAULT_WRITE;
- } else
+ page_copied = 1;
+ } else {
mem_cgroup_cancel_charge(new_page, memcg);
+ }
if (new_page)
page_cache_release(new_page);
-unlock:
+
pte_unmap_unlock(page_table, ptl);
- if (mmun_end > mmun_start)
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
}
- return ret;
+ return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
page_cache_release(new_page);
oom:
@@ -2258,6 +2181,179 @@ oom:
return VM_FAULT_OOM;
}
+/*
+ * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
+ * mapping
+ */
+static int wp_pfn_shared(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ pmd_t *pmd)
+{
+ if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
+ struct vm_fault vmf = {
+ .page = NULL,
+ .pgoff = linear_page_index(vma, address),
+ .virtual_address = (void __user *)(address & PAGE_MASK),
+ .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
+ };
+ int ret;
+
+ pte_unmap_unlock(page_table, ptl);
+ ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
+ if (ret & VM_FAULT_ERROR)
+ return ret;
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ /*
+ * We might have raced with another page fault while we
+ * released the pte_offset_map_lock.
+ */
+ if (!pte_same(*page_table, orig_pte)) {
+ pte_unmap_unlock(page_table, ptl);
+ return 0;
+ }
+ }
+ return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
+ NULL, 0, 0);
+}
+
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table,
+ pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+ struct page *old_page)
+ __releases(ptl)
+{
+ int page_mkwrite = 0;
+
+ page_cache_get(old_page);
+
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ int tmp;
+
+ pte_unmap_unlock(page_table, ptl);
+ tmp = do_page_mkwrite(vma, old_page, address);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(old_page);
+ return tmp;
+ }
+ /*
+ * Since we dropped the lock we need to revalidate
+ * the PTE as someone else may have changed it. If
+ * they did, we just return, as we can count on the
+ * MMU to tell us if they didn't also make it writable.
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_mkwrite = 1;
+ }
+
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
+{
+ struct page *old_page;
+
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+ * VM_PFNMAP VMA.
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable and/or call ops->pfn_mkwrite.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ return wp_pfn_shared(mm, vma, address, page_table, ptl,
+ orig_pte, pmd);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+ }
+
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (!trylock_page(old_page)) {
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ lock_page(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_cache_release(old_page);
+ }
+ if (reuse_swap_page(old_page)) {
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, 0, 0);
+ }
+ unlock_page(old_page);
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ return wp_page_shared(mm, vma, address, page_table, pmd,
+ ptl, orig_pte, old_page);
+ }
+
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ page_cache_get(old_page);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+}
+
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
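
The hunk above replaces the monolithic do_wp_page() with a dispatcher over those helpers. A hypothetical, compilable model of the resulting decision tree (every name below is a placeholder, not a kernel interface; locking, PTE revalidation and reference counting are omitted):

#include <stdbool.h>
#include <stdio.h>

enum wp_path { WP_PFN_SHARED, WP_PAGE_REUSE, WP_PAGE_SHARED, WP_PAGE_COPY };

struct fault {
	bool has_struct_page;   /* vm_normal_page() found a struct page          */
	bool shared_writable;   /* vma has both VM_WRITE and VM_SHARED           */
	bool anon;              /* PageAnon() && !PageKsm()                      */
	bool exclusive;         /* reuse_swap_page(): we hold the only reference */
};

static enum wp_path classify_wp_fault(const struct fault *f)
{
	if (!f->has_struct_page)
		return f->shared_writable ? WP_PFN_SHARED : WP_PAGE_COPY;
	if (f->anon)
		return f->exclusive ? WP_PAGE_REUSE : WP_PAGE_COPY;
	if (f->shared_writable)
		return WP_PAGE_SHARED;
	return WP_PAGE_COPY;    /* plain COW */
}

int main(void)
{
	const struct fault anon_cow = { .has_struct_page = true, .anon = true };
	const struct fault shared_file = { .has_struct_page = true,
					   .shared_writable = true };

	printf("anon page with other users left: path %d (copy)\n",
	       classify_wp_fault(&anon_cow));
	printf("shared writable file page:       path %d (reuse + dirty)\n",
	       classify_wp_fault(&shared_file));
	return 0;
}
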
@@ -2574,6 +2670,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap(page_table);
+ /* File mapping without ->vm_ops ? */
+ if (vma->vm_flags & VM_SHARED)
+ return VM_FAULT_SIGBUS;
+
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGSEGV;
@@ -2594,6 +2694,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
+
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ goto oom_free_page;
+
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
@@ -2601,9 +2705,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
__SetPageUptodate(page);
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
- goto oom_free_page;
-
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -2784,7 +2885,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
struct vm_fault vmf;
int off;
- nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+ nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
start_addr = max(address & mask, vma->vm_start);
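
The ACCESS_ONCE() -> READ_ONCE() switch above matters because fault_around_bytes is tunable at runtime (via debugfs) while faults are in flight, so the value must be read exactly once per fault. A simplified, self-contained illustration; READ_ONCE_SCALAR below is a stand-in macro rather than the kernel's implementation, and PAGE_SHIFT is assumed to be 12:

/*
 * A volatile cast tells the compiler to emit exactly one load and never to
 * re-read or cache the value across this statement.
 */
#include <stdio.h>

#define READ_ONCE_SCALAR(x) (*(const volatile __typeof__(x) *)&(x))

static unsigned long fault_around_bytes = 1UL << 16;  /* runtime-tunable in the kernel */

int main(void)
{
	/* Snapshot once, as the patched do_fault_around() does. */
	unsigned long nr_pages = READ_ONCE_SCALAR(fault_around_bytes) >> 12;

	printf("nr_pages = %lu\n", nr_pages);
	return 0;
}
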
@@ -3002,6 +3103,9 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
+ /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
+ if (!vma->vm_ops->fault)
+ return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
@@ -3147,13 +3251,12 @@ static int handle_pte_fault(struct mm_struct *mm,
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops) {
- if (likely(vma->vm_ops->fault))
- return do_fault(mm, vma, address, pte,
- pmd, flags, entry);
- }
- return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ if (vma->vm_ops)
+ return do_fault(mm, vma, address, pte, pmd,
+ flags, entry);
+
+ return do_anonymous_page(mm, vma, address, pte, pmd,
+ flags);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
@@ -3629,7 +3732,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
if (buf) {
char *p;
- p = d_path(&f->f_path, buf, PAGE_SIZE);
+ p = file_path(f, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
printk("%s%s[%lx+%lx]", prefix, kbasename(p),
@@ -3642,7 +3745,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
-void might_fault(void)
+void __might_fault(const char *file, int line)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
@@ -3652,21 +3755,15 @@ void might_fault(void)
*/
if (segment_eq(get_fs(), KERNEL_DS))
return;
-
- /*
- * it would be nicer only to annotate paths which are not under
- * pagefault_disable, however that requires a larger audit and
- * providing helpers like get_user_atomic.
- */
- if (in_atomic())
+ if (pagefault_disabled())
return;
-
- __might_sleep(__FILE__, __LINE__, 0);
-
+ __might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_sem);
+#endif
}
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)