summaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c111
1 files changed, 105 insertions, 6 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1835d548fecc..8028fb7677eb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1014,6 +1014,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
vma->vm_private_data = (void *)0;
}
+/*
+ * Reset and decrement one ref on hugepage private reservation.
+ * Called with mm->mmap_sem writer semaphore held.
+ * This function should be only used by move_vma() and operate on
+ * same sized vma. It should never come here with last ref on the
+ * reservation.
+ */
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ /*
+ * Clear the old hugetlb private page reservation.
+ * It has already been transferred to new_vma.
+ *
+ * During a mremap() operation of a hugetlb vma we call move_vma()
+ * which copies vma into new_vma and unmaps vma. After the copy
+ * operation both new_vma and vma share a reference to the resv_map
+ * struct, and at that point vma is about to be unmapped. We don't
+ * want to return the reservation to the pool at unmap of vma because
+ * the reservation still lives on in new_vma, so simply decrement the
+ * ref here and remove the resv_map reference from this vma.
+ */
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ kref_put(&reservations->refs, resv_map_release);
+
+ reset_vma_resv_huge_pages(vma);
+}
+
/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
@@ -4718,6 +4747,82 @@ again:
return ret;
}
+static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pte_t *src_pte)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *dst_pte, pte;
+ spinlock_t *src_ptl, *dst_ptl;
+
+ dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
+ dst_ptl = huge_pte_lock(h, mm, dst_pte);
+ src_ptl = huge_pte_lockptr(h, mm, src_pte);
+
+ /*
+ * We don't have to worry about the ordering of src and dst ptlocks
+ * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
+ */
+ if (src_ptl != dst_ptl)
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ set_huge_pte_at(mm, new_addr, dst_pte, pte);
+
+ if (src_ptl != dst_ptl)
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+}
+
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+ struct vm_area_struct *new_vma,
+ unsigned long old_addr, unsigned long new_addr,
+ unsigned long len)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long sz = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_end = old_addr + len;
+ unsigned long old_addr_copy;
+ pte_t *src_pte, *dst_pte;
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
+ old_end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
+ /* Prevent race with file truncation */
+ i_mmap_lock_write(mapping);
+ for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
+ src_pte = huge_pte_offset(mm, old_addr, sz);
+ if (!src_pte)
+ continue;
+ if (huge_pte_none(huge_ptep_get(src_pte)))
+ continue;
+
+ /* old_addr arg to huge_pmd_unshare() is a pointer and so the
+ * arg may be modified. Pass a copy instead to preserve the
+ * value in old_addr.
+ */
+ old_addr_copy = old_addr;
+
+ if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+ continue;
+
+ dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
+ if (!dst_pte)
+ break;
+
+ move_huge_pte(vma, old_addr, new_addr, src_pte);
+ }
+ i_mmap_unlock_write(mapping);
+ flush_tlb_range(vma, old_end - len, old_end);
+ mmu_notifier_invalidate_range_end(&range);
+
+ return len + old_addr - old_end;
+}
+
static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
@@ -6257,12 +6362,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* sharing is possible. For hugetlbfs, this prevents removal of any page
* table entries associated with the address space. This is important as we
* are setting up sharing based on existing page table entries (mappings).
- *
- * NOTE: This routine is only called from huge_pte_alloc. Some callers of
- * huge_pte_alloc know that sharing is not possible and do not take
- * i_mmap_rwsem as a performance optimization. This is handled by the
- * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
- * only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)