Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig              |   54
-rw-r--r-- | mm/Makefile             |    1
-rw-r--r-- | mm/balloon_compaction.c |    8
-rw-r--r-- | mm/fadvise.c            |    6
-rw-r--r-- | mm/gup.c                |   29
-rw-r--r-- | mm/hmm.c                | 1257
-rw-r--r-- | mm/huge_memory.c        |  174
-rw-r--r-- | mm/interval_tree.c      |   10
-rw-r--r-- | mm/madvise.c            |    2
-rw-r--r-- | mm/memcontrol.c         |  256
-rw-r--r-- | mm/memory.c             |  138
-rw-r--r-- | mm/memory_hotplug.c     |   14
-rw-r--r-- | mm/mempolicy.c          |  142
-rw-r--r-- | mm/migrate.c            | 1007
-rw-r--r-- | mm/mlock.c              |   10
-rw-r--r-- | mm/mmap.c               |   10
-rw-r--r-- | mm/mprotect.c           |   18
-rw-r--r-- | mm/mremap.c             |    2
-rw-r--r-- | mm/page_alloc.c         |   13
-rw-r--r-- | mm/page_vma_mapped.c    |   28
-rw-r--r-- | mm/pgtable-generic.c    |    3
-rw-r--r-- | mm/rmap.c               |   50
-rw-r--r-- | mm/slub.c               |    2
-rw-r--r-- | mm/sparse.c             |    2
-rw-r--r-- | mm/swap.c               |   11
-rw-r--r-- | mm/swapfile.c           |    3
-rw-r--r-- | mm/vmstat.c             |  154
-rw-r--r-- | mm/zsmalloc.c           |   17
28 files changed, 3180 insertions, 241 deletions
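
For orientation before the patch itself, here is a minimal illustrative sketch (not part of the patch) of how a driver might consume the HMM mirror API that the mm/hmm.c hunks below introduce. The hmm_*() calls and the retry-on-invalidate pattern follow the hmm_vma_get_pfns()/hmm_vma_range_done() kernel-doc in this diff; every my_*() name is a hypothetical placeholder, and the sketch assumes hmm_mirror_register() has already been called for this mm (with mmap_sem held for write, as the diff requires).

/*
 * Illustrative sketch only -- not part of the patch. A hypothetical
 * driver snapshots part of a process address space with the mirror API
 * added in mm/hmm.c below. All my_*() names are placeholders.
 */
#include <linux/mm.h>
#include <linux/hmm.h>

/* Hypothetical stand-ins for the driver's device page table handling. */
static void my_device_page_table_lock(void) { }
static void my_device_page_table_unlock(void) { }
static void my_device_commit_pfns(const hmm_pfn_t *pfns,
				  unsigned long start, unsigned long end) { }

static int my_snapshot_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end,
			     hmm_pfn_t *pfns)
{
	struct hmm_range range;
	int ret;

again:
	/* Fill pfns[] from the CPU page table and start tracking updates. */
	ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
	if (ret)
		return ret;

	/*
	 * Commit to the device page table only if the snapshot was not
	 * invalidated in the meantime; otherwise retry, exactly as the
	 * hmm_vma_get_pfns()/hmm_vma_range_done() kernel-doc describes.
	 */
	my_device_page_table_lock();
	if (!hmm_vma_range_done(vma, &range)) {
		my_device_page_table_unlock();
		goto again;
	}
	my_device_commit_pfns(pfns, start, end);
	my_device_page_table_unlock();
	return 0;
}
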
diff --git a/mm/Kconfig b/mm/Kconfig index 0ded10a22639..9c4bdddd80c2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,6 +262,9 @@ config MIGRATION config ARCH_ENABLE_HUGEPAGE_MIGRATION bool +config ARCH_ENABLE_THP_MIGRATION + bool + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT @@ -673,7 +676,7 @@ config ARCH_HAS_ZONE_DEVICE bool config ZONE_DEVICE - bool "Device memory (pmem, etc...) hotplug support" + bool "Device memory (pmem, HMM, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP @@ -689,6 +692,55 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM + bool + default y + depends on (X86_64 || PPC64) + depends on ZONE_DEVICE + depends on MMU && 64BIT + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + +config MIGRATE_VMA_HELPER + bool + +config HMM + bool + select MIGRATE_VMA_HELPER + +config HMM_MIRROR + bool "HMM mirror CPU page table into a device page table" + depends on ARCH_HAS_HMM + select MMU_NOTIFIER + select HMM + help + Select HMM_MIRROR if you want to mirror range of the CPU page table of a + process into a device page table. Here, mirror means "keep synchronized". + Prerequisites: the device must provide the ability to write-protect its + page tables (at PAGE_SIZE granularity), and must be able to recover from + the resulting potential page faults. + +config DEVICE_PRIVATE + bool "Unaddressable device memory (GPU memory, ...)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent unaddressable device + memory; i.e., memory that is only accessible from the device (or + group of devices). You likely also want to select HMM_MIRROR. + +config DEVICE_PUBLIC + bool "Addressable device memory (like GPU memory)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent addressable device + memory; i.e., memory that is accessible from both the device and + the CPU + config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index 411bd24d4a7c..e3ac3aeb533b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o +obj-$(CONFIG_HMM) += hmm.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index b06d9fe23a28..68d28924ba79 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping, { struct balloon_dev_info *balloon = balloon_page_device(page); + /* + * We can not easily support the no copy case here so ignore it as it + * is unlikely to be use with ballon pages. See include/linux/hmm.h for + * user of the MIGRATE_SYNC_NO_COPY mode. 
+ */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/fadvise.c b/mm/fadvise.c index a43013112581..702f239cd6db 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -52,7 +52,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (IS_DAX(inode)) { + bdi = inode_to_bdi(mapping->host); + + if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: @@ -75,8 +77,6 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = inode_to_bdi(mapping->host); - switch (advice) { case POSIX_FADV_NORMAL: f.file->f_ra.ra_pages = bdi->ra_pages; @@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } +retry: + if (!pmd_present(*pmd)) { + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(*pmd)); + if (is_pmd_migration_entry(*pmd)) + pmd_migration_entry_wait(mm, pmd); + goto retry; + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); +retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_present(*pmd))) { + spin_unlock(ptl); + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); @@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); @@ -438,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); + + /* + * This should never happen (a device public page in the gate + * area). + */ + if (is_device_public_page(*page)) + goto unmap; } get_page(*page); out: @@ -1534,7 +1559,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/hmm.c b/mm/hmm.c new file mode 100644 index 000000000000..a88a847bccba --- /dev/null +++ b/mm/hmm.c @@ -0,0 +1,1257 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Authors: Jérôme Glisse <jglisse@redhat.com> + */ +/* + * Refer to include/linux/hmm.h for information about heterogeneous memory + * management or HMM for short. + */ +#include <linux/mm.h> +#include <linux/hmm.h> +#include <linux/init.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mmzone.h> +#include <linux/pagemap.h> +#include <linux/swapops.h> +#include <linux/hugetlb.h> +#include <linux/memremap.h> +#include <linux/jump_label.h> +#include <linux/mmu_notifier.h> +#include <linux/memory_hotplug.h> + +#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) + +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) +/* + * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h + */ +DEFINE_STATIC_KEY_FALSE(device_private_key); +EXPORT_SYMBOL(device_private_key); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ + + +#if IS_ENABLED(CONFIG_HMM_MIRROR) +static const struct mmu_notifier_ops hmm_mmu_notifier_ops; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @sequence: we track updates to the CPU page table with a sequence number + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + */ +struct hmm { + struct mm_struct *mm; + spinlock_t lock; + atomic_t sequence; + struct list_head ranges; + struct list_head mirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; +}; + +/* + * hmm_register - register HMM against an mm (HMM internal) + * + * @mm: mm struct to attach to + * + * This is not intended to be used directly by device drivers. It allocates an + * HMM struct if mm does not have one, and initializes it. + */ +static struct hmm *hmm_register(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); + bool cleanup = false; + + /* + * The hmm struct can only be freed once the mm_struct goes away, + * hence we should always have pre-allocated an new hmm struct + * above. 
+ */ + if (hmm) + return hmm; + + hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + if (!hmm) + return NULL; + INIT_LIST_HEAD(&hmm->mirrors); + init_rwsem(&hmm->mirrors_sem); + atomic_set(&hmm->sequence, 0); + hmm->mmu_notifier.ops = NULL; + INIT_LIST_HEAD(&hmm->ranges); + spin_lock_init(&hmm->lock); + hmm->mm = mm; + + /* + * We should only get here if hold the mmap_sem in write mode ie on + * registration of first mirror through hmm_mirror_register() + */ + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { + kfree(hmm); + return NULL; + } + + spin_lock(&mm->page_table_lock); + if (!mm->hmm) + mm->hmm = hmm; + else + cleanup = true; + spin_unlock(&mm->page_table_lock); + + if (cleanup) { + mmu_notifier_unregister(&hmm->mmu_notifier, mm); + kfree(hmm); + } + + return mm->hmm; +} + +void hmm_mm_destroy(struct mm_struct *mm) +{ + kfree(mm->hmm); +} + +static void hmm_invalidate_range(struct hmm *hmm, + enum hmm_update_type action, + unsigned long start, + unsigned long end) +{ + struct hmm_mirror *mirror; + struct hmm_range *range; + + spin_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { + unsigned long addr, idx, npages; + + if (end < range->start || start >= range->end) + continue; + + range->valid = false; + addr = max(start, range->start); + idx = (addr - range->start) >> PAGE_SHIFT; + npages = (min(range->end, end) - addr) >> PAGE_SHIFT; + memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); + } + spin_unlock(&hmm->lock); + + down_read(&hmm->mirrors_sem); + list_for_each_entry(mirror, &hmm->mirrors, list) + mirror->ops->sync_cpu_device_pagetables(mirror, action, + start, end); + up_read(&hmm->mirrors_sem); +} + +static void hmm_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct hmm *hmm = mm->hmm; + + VM_BUG_ON(!hmm); + + atomic_inc(&hmm->sequence); +} + +static void hmm_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct hmm *hmm = mm->hmm; + + VM_BUG_ON(!hmm); + + hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); +} + +static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .invalidate_range_start = hmm_invalidate_range_start, + .invalidate_range_end = hmm_invalidate_range_end, +}; + +/* + * hmm_mirror_register() - register a mirror against an mm + * + * @mirror: new mirror struct to register + * @mm: mm to register against + * + * To start mirroring a process address space, the device driver must register + * an HMM mirror struct. + * + * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! + */ +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) +{ + /* Sanity check */ + if (!mm || !mirror || !mirror->ops) + return -EINVAL; + + mirror->hmm = hmm_register(mm); + if (!mirror->hmm) + return -ENOMEM; + + down_write(&mirror->hmm->mirrors_sem); + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + + return 0; +} +EXPORT_SYMBOL(hmm_mirror_register); + +/* + * hmm_mirror_unregister() - unregister a mirror + * + * @mirror: new mirror struct to register + * + * Stop mirroring a process address space, and cleanup. 
+ */ +void hmm_mirror_unregister(struct hmm_mirror *mirror) +{ + struct hmm *hmm = mirror->hmm; + + down_write(&hmm->mirrors_sem); + list_del(&mirror->list); + up_write(&hmm->mirrors_sem); +} +EXPORT_SYMBOL(hmm_mirror_unregister); + +struct hmm_vma_walk { + struct hmm_range *range; + unsigned long last; + bool fault; + bool block; + bool write; +}; + +static int hmm_vma_do_fault(struct mm_walk *walk, + unsigned long addr, + hmm_pfn_t *pfn) +{ + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct vm_area_struct *vma = walk->vma; + int r; + + flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; + flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + r = handle_mm_fault(vma, addr, flags); + if (r & VM_FAULT_RETRY) + return -EBUSY; + if (r & VM_FAULT_ERROR) { + *pfn = HMM_PFN_ERROR; + return -EFAULT; + } + + return -EAGAIN; +} + +static void hmm_pfns_special(hmm_pfn_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = HMM_PFN_SPECIAL; +} + +static int hmm_pfns_bad(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = HMM_PFN_ERROR; + + return 0; +} + +static void hmm_pfns_clear(hmm_pfn_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = 0; +} + +static int hmm_vma_walk_hole(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + hmm_vma_walk->last = addr; + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) { + pfns[i] = HMM_PFN_EMPTY; + if (hmm_vma_walk->fault) { + int ret; + + ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + if (ret != -EAGAIN) + return ret; + } + } + + return hmm_vma_walk->fault ? -EAGAIN : 0; +} + +static int hmm_vma_walk_clear(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + hmm_vma_walk->last = addr; + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) { + pfns[i] = 0; + if (hmm_vma_walk->fault) { + int ret; + + ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + if (ret != -EAGAIN) + return ret; + } + } + + return hmm_vma_walk->fault ? -EAGAIN : 0; +} + +static int hmm_vma_walk_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + hmm_pfn_t *pfns = range->pfns; + unsigned long addr = start, i; + bool write_fault; + hmm_pfn_t flag; + pte_t *ptep; + + i = (addr - range->start) >> PAGE_SHIFT; + flag = vma->vm_flags & VM_READ ? 
HMM_PFN_READ : 0; + write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; + +again: + if (pmd_none(*pmdp)) + return hmm_vma_walk_hole(start, end, walk); + + if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + return hmm_pfns_bad(start, end, walk); + + if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { + unsigned long pfn; + pmd_t pmd; + + /* + * No need to take pmd_lock here, even if some other threads + * is splitting the huge pmd we will get that event through + * mmu_notifier callback. + * + * So just read pmd value and check again its a transparent + * huge or device mapping one and compute corresponding pfn + * values. + */ + pmd = pmd_read_atomic(pmdp); + barrier(); + if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + goto again; + if (pmd_protnone(pmd)) + return hmm_vma_walk_clear(start, end, walk); + + if (write_fault && !pmd_write(pmd)) + return hmm_vma_walk_clear(start, end, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; + return 0; + } + + if (pmd_bad(*pmdp)) + return hmm_pfns_bad(start, end, walk); + + ptep = pte_offset_map(pmdp, addr); + for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { + pte_t pte = *ptep; + + pfns[i] = 0; + + if (pte_none(pte)) { + pfns[i] = HMM_PFN_EMPTY; + if (hmm_vma_walk->fault) + goto fault; + continue; + } + + if (!pte_present(pte)) { + swp_entry_t entry; + + if (!non_swap_entry(entry)) { + if (hmm_vma_walk->fault) + goto fault; + continue; + } + + entry = pte_to_swp_entry(pte); + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. + */ + if (is_device_private_entry(entry)) { + pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); + if (is_write_device_private_entry(entry)) { + pfns[i] |= HMM_PFN_WRITE; + } else if (write_fault) + goto fault; + pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; + pfns[i] |= flag; + } else if (is_migration_entry(entry)) { + if (hmm_vma_walk->fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + continue; + } else { + /* Report error for everything else */ + pfns[i] = HMM_PFN_ERROR; + } + continue; + } + + if (write_fault && !pte_write(pte)) + goto fault; + + pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + continue; + +fault: + pte_unmap(ptep); + /* Fault all pages in range */ + return hmm_vma_walk_clear(start, end, walk); + } + pte_unmap(ptep - 1); + + return 0; +} + +/* + * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses + * @vma: virtual memory area containing the virtual address range + * @range: used to track snapshot validity + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @entries: array of hmm_pfn_t: provided by the caller, filled in by function + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See hmm_vma_range_done() for further + * information. + * + * The range struct is initialized here. It tracks the CPU page table, but only + * if the function returns success (0), in which case the caller must then call + * hmm_vma_range_done() to stop CPU page table update tracking on this range. 
+ * + * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS + * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! + */ +int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns) +{ + struct hmm_vma_walk hmm_vma_walk; + struct mm_walk mm_walk; + struct hmm *hmm; + + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(pfns, start, end); + return -EINVAL; + } + + /* Sanity check, this really should not happen ! */ + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + hmm = hmm_register(vma->vm_mm); + if (!hmm) + return -ENOMEM; + /* Caller must have registered a mirror, via hmm_mirror_register() ! */ + if (!hmm->mmu_notifier.ops) + return -EINVAL; + + /* Initialize range to track CPU page table update */ + range->start = start; + range->pfns = pfns; + range->end = end; + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + walk_page_range(start, end, &mm_walk); + return 0; +} +EXPORT_SYMBOL(hmm_vma_get_pfns); + +/* + * hmm_vma_range_done() - stop tracking change to CPU page table over a range + * @vma: virtual memory area containing the virtual address range + * @range: range being tracked + * Returns: false if range data has been invalidated, true otherwise + * + * Range struct is used to track updates to the CPU page table after a call to + * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done + * using the data, or wants to lock updates to the data it got from those + * functions, it must call the hmm_vma_range_done() function, which will then + * stop tracking CPU page table updates. + * + * Note that device driver must still implement general CPU page table update + * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using + * the mmu_notifier API directly. + * + * CPU page table update tracking done through hmm_range is only temporary and + * to be used while trying to duplicate CPU page table contents for a range of + * virtual addresses. 
+ * + * There are two ways to use this : + * again: + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * trans = device_build_page_table_update_transaction(pfns); + * device_page_table_lock(); + * if (!hmm_vma_range_done(vma, range)) { + * device_page_table_unlock(); + * goto again; + * } + * device_commit_transaction(trans); + * device_page_table_unlock(); + * + * Or: + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * device_page_table_lock(); + * hmm_vma_range_done(vma, range); + * device_update_page_table(pfns); + * device_page_table_unlock(); + */ +bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +{ + unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; + struct hmm *hmm; + + if (range->end <= range->start) { + BUG(); + return false; + } + + hmm = hmm_register(vma->vm_mm); + if (!hmm) { + memset(range->pfns, 0, sizeof(*range->pfns) * npages); + return false; + } + + spin_lock(&hmm->lock); + list_del_rcu(&range->list); + spin_unlock(&hmm->lock); + + return range->valid; +} +EXPORT_SYMBOL(hmm_vma_range_done); + +/* + * hmm_vma_fault() - try to fault some address in a virtual address range + * @vma: virtual memory area containing the virtual address range + * @range: use to track pfns array content validity + * @start: fault range virtual start address (inclusive) + * @end: fault range virtual end address (exclusive) + * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted + * @write: is it a write fault + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * + * This is similar to a regular CPU page fault except that it will not trigger + * any memory migration if the memory being faulted is not accessible by CPUs. + * + * On error, for one virtual address in the range, the function will set the + * hmm_pfn_t error flag for the corresponding pfn entry. + * + * Expected use pattern: + * retry: + * down_read(&mm->mmap_sem); + * // Find vma and address device wants to fault, initialize hmm_pfn_t + * // array accordingly + * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * switch (ret) { + * case -EAGAIN: + * hmm_vma_range_done(vma, range); + * // You might want to rate limit or yield to play nicely, you may + * // also commit any valid pfn in the array assuming that you are + * // getting true from hmm_vma_range_monitor_end() + * goto retry; + * case 0: + * break; + * default: + * // Handle error ! + * up_read(&mm->mmap_sem) + * return; + * } + * // Take device driver lock that serialize device page table update + * driver_lock_device_page_table_update(); + * hmm_vma_range_done(vma, range); + * // Commit pfns we got from hmm_vma_fault() + * driver_unlock_device_page_table_update(); + * up_read(&mm->mmap_sem) + * + * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) + * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! + * + * YOU HAVE BEEN WARNED ! + */ +int hmm_vma_fault(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns, + bool write, + bool block) +{ + struct hmm_vma_walk hmm_vma_walk; + struct mm_walk mm_walk; + struct hmm *hmm; + int ret; + + /* Sanity check, this really should not happen ! 
*/ + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + hmm = hmm_register(vma->vm_mm); + if (!hmm) { + hmm_pfns_clear(pfns, start, end); + return -ENOMEM; + } + /* Caller must have registered a mirror using hmm_mirror_register() */ + if (!hmm->mmu_notifier.ops) + return -EINVAL; + + /* Initialize range to track CPU page table update */ + range->start = start; + range->pfns = pfns; + range->end = end; + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(pfns, start, end); + return 0; + } + + hmm_vma_walk.fault = true; + hmm_vma_walk.write = write; + hmm_vma_walk.block = block; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + hmm_vma_walk.last = range->start; + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + do { + ret = walk_page_range(start, end, &mm_walk); + start = hmm_vma_walk.last; + } while (ret == -EAGAIN); + + if (ret) { + unsigned long i; + + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); + hmm_vma_range_done(vma, range); + } + return ret; +} +EXPORT_SYMBOL(hmm_vma_fault); +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ + + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + + page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (!page) + return NULL; + lock_page(page); + return page; +} +EXPORT_SYMBOL(hmm_vma_alloc_locked_page); + + +static void hmm_devmem_ref_release(struct percpu_ref *ref) +{ + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + complete(&devmem->completion); +} + +static void hmm_devmem_ref_exit(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_exit(ref); + devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); +} + +static void hmm_devmem_ref_kill(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_kill(ref); + wait_for_completion(&devmem->completion); + devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); +} + +static int hmm_devmem_fault(struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp) +{ + struct hmm_devmem *devmem = page->pgmap->data; + + return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); +} + +static void hmm_devmem_free(struct page *page, void *data) +{ + struct hmm_devmem *devmem = data; + + devmem->ops->free(devmem, page); +} + +static DEFINE_MUTEX(hmm_devmem_lock); +static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); + +static void hmm_devmem_radix_release(struct resource *resource) +{ + resource_size_t key, align_start, align_size, align_end; + + align_start = resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); + align_end = align_start + align_size - 1; + + mutex_lock(&hmm_devmem_lock); + for (key = resource->start; + key <= 
resource->end; + key += PA_SECTION_SIZE) + radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&hmm_devmem_lock); +} + +static void hmm_devmem_release(struct device *dev, void *data) +{ + struct hmm_devmem *devmem = data; + struct resource *resource = devmem->resource; + unsigned long start_pfn, npages; + struct zone *zone; + struct page *page; + + if (percpu_ref_tryget_live(&devmem->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(&devmem->ref); + } + + /* pages are dead and unused, undo the arch mapping */ + start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; + npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; + + page = pfn_to_page(start_pfn); + zone = page_zone(page); + + mem_hotplug_begin(); + if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) + __remove_pages(zone, start_pfn, npages); + else + arch_remove_memory(start_pfn << PAGE_SHIFT, + npages << PAGE_SHIFT); + mem_hotplug_done(); + + hmm_devmem_radix_release(resource); +} + +static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); +} + +static int hmm_devmem_pages_create(struct hmm_devmem *devmem) +{ + resource_size_t key, align_start, align_size, align_end; + struct device *device = devmem->device; + int ret, nid, is_ram; + unsigned long pfn; + + align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(devmem->resource->start + + resource_size(devmem->resource), + PA_SECTION_SIZE) - align_start; + + is_ram = region_intersects(align_start, align_size, + IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE); + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, devmem->resource); + return -ENXIO; + } + if (is_ram == REGION_INTERSECTS) + return -ENXIO; + + if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) + devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; + else + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + + devmem->pagemap.res = devmem->resource; + devmem->pagemap.page_fault = hmm_devmem_fault; + devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.dev = devmem->device; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; + + mutex_lock(&hmm_devmem_lock); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { + struct hmm_devmem *dup; + + rcu_read_lock(); + dup = hmm_devmem_find(key); + rcu_read_unlock(); + if (dup) { + dev_err(device, "%s: collides with mapping for %s\n", + __func__, dev_name(dup->device)); + mutex_unlock(&hmm_devmem_lock); + ret = -EBUSY; + goto error; + } + ret = radix_tree_insert(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT, + devmem); + if (ret) { + dev_err(device, "%s: failed: %d\n", __func__, ret); + mutex_unlock(&hmm_devmem_lock); + goto error_radix; + } + } + mutex_unlock(&hmm_devmem_lock); + + nid = dev_to_node(device); + if (nid < 0) + nid = numa_mem_id(); + + mem_hotplug_begin(); + /* + * For device private memory we call add_pages() as we only need to + * allocate and initialize struct page for the device memory. More- + * over the device memory is un-accessible thus we do not want to + * create a linear mapping for the memory like arch_add_memory() + * would do. + * + * For device public memory, which is accesible by the CPU, we do + * want the linear mapping and thus use arch_add_memory(). 
+ */ + if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) + ret = arch_add_memory(nid, align_start, align_size, false); + else + ret = add_pages(nid, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, false); + if (ret) { + mem_hotplug_done(); + goto error_add_memory; + } + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); + mem_hotplug_done(); + + for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { + struct page *page = pfn_to_page(pfn); + + page->pgmap = &devmem->pagemap; + } + return 0; + +error_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); +error_radix: + hmm_devmem_radix_release(devmem->resource); +error: + return ret; +} + +static int hmm_devmem_match(struct device *dev, void *data, void *match_data) +{ + struct hmm_devmem *devmem = data; + + return devmem->resource == match_data; +} + +static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) +{ + devres_release(devmem->device, &hmm_devmem_release, + &hmm_devmem_match, devmem->resource); +} + +/* + * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory + * + * @ops: memory event device driver callback (see struct hmm_devmem_ops) + * @device: device struct to bind the resource too + * @size: size in bytes of the device memory to add + * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise + * + * This function first finds an empty range of physical address big enough to + * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which + * in turn allocates struct pages. It does not do anything beyond that; all + * events affecting the memory will go through the various callbacks provided + * by hmm_devmem_ops struct. + * + * Device driver should call this function during device initialization and + * is then responsible of memory management. HMM only provides helpers. + */ +struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, + struct device *device, + unsigned long size) +{ + struct hmm_devmem *devmem; + resource_size_t addr; + int ret; + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = NULL; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + size = ALIGN(size, PA_SECTION_SIZE); + addr = min((unsigned long)iomem_resource.end, + (1UL << MAX_PHYSMEM_BITS) - 1); + addr = addr - size + 1UL; + + /* + * FIXME add a new helper to quickly walk resource tree and find free + * range + * + * FIXME what about ioport_resource resource ? 
+ */ + for (; addr > size && addr >= iomem_resource.start; addr -= size) { + ret = region_intersects(addr, size, 0, IORES_DESC_NONE); + if (ret != REGION_DISJOINT) + continue; + + devmem->resource = devm_request_mem_region(device, addr, size, + dev_name(device)); + if (!devmem->resource) { + ret = -ENOMEM; + goto error_no_resource; + } + break; + } + if (!devmem->resource) { + ret = -ERANGE; + goto error_no_resource; + } + + devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_pages; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_pages: + devm_release_mem_region(device, devmem->resource->start, + resource_size(devmem->resource)); +error_no_resource: +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add); + +struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, + struct device *device, + struct resource *res) +{ + struct hmm_devmem *devmem; + int ret; + + if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) + return ERR_PTR(-EINVAL); + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = res; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_devm_add_action; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add_resource); + +/* + * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) + * + * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory + * + * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf + * of the device driver. It will free struct page and remove the resource that + * reserved the physical address range for this device memory. 
+ */ +void hmm_devmem_remove(struct hmm_devmem *devmem) +{ + resource_size_t start, size; + struct device *device; + bool cdm = false; + + if (!devmem) + return; + + device = devmem->device; + start = devmem->resource->start; + size = resource_size(devmem->resource); + + cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); + hmm_devmem_pages_remove(devmem); + + if (!cdm) + devm_release_mem_region(device, start, size); +} +EXPORT_SYMBOL(hmm_devmem_remove); + +/* + * A device driver that wants to handle multiple devices memory through a + * single fake device can use hmm_device to do so. This is purely a helper + * and it is not needed to make use of any HMM functionality. + */ +#define HMM_DEVICE_MAX 256 + +static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); +static DEFINE_SPINLOCK(hmm_device_lock); +static struct class *hmm_device_class; +static dev_t hmm_device_devt; + +static void hmm_device_release(struct device *device) +{ + struct hmm_device *hmm_device; + + hmm_device = container_of(device, struct hmm_device, device); + spin_lock(&hmm_device_lock); + clear_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + kfree(hmm_device); +} + +struct hmm_device *hmm_device_new(void *drvdata) +{ + struct hmm_device *hmm_device; + + hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); + if (!hmm_device) + return ERR_PTR(-ENOMEM); + + spin_lock(&hmm_device_lock); + hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); + if (hmm_device->minor >= HMM_DEVICE_MAX) { + spin_unlock(&hmm_device_lock); + kfree(hmm_device); + return ERR_PTR(-EBUSY); + } + set_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); + hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), + hmm_device->minor); + hmm_device->device.release = hmm_device_release; + dev_set_drvdata(&hmm_device->device, drvdata); + hmm_device->device.class = hmm_device_class; + device_initialize(&hmm_device->device); + + return hmm_device; +} +EXPORT_SYMBOL(hmm_device_new); + +void hmm_device_put(struct hmm_device *hmm_device) +{ + put_device(&hmm_device->device); +} +EXPORT_SYMBOL(hmm_device_put); + +static int __init hmm_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hmm_device_devt, 0, + HMM_DEVICE_MAX, + "hmm_device"); + if (ret) + return ret; + + hmm_device_class = class_create(THIS_MODULE, "hmm_device"); + if (IS_ERR(hmm_device_class)) { + unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); + return PTR_ERR(hmm_device_class); + } + return 0; +} + +device_initcall(hmm_init); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0b51e70e0a8b..269b5df58543 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -928,6 +928,25 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (unlikely(is_swap_pmd(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + if (is_write_migration_entry(entry)) { + make_migration_entry_read(&entry); + pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } +#endif + if (unlikely(!pmd_trans_huge(pmd))) { 
pte_free(dst_mm, pgtable); goto out_unlock; @@ -1599,6 +1618,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) goto out; + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto out; + } + page = pmd_page(orig_pmd); /* * If other processes are mapping this page, we couldn't discard @@ -1684,10 +1709,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { - struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page, true); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(!PageHead(page), page); + struct page *page = NULL; + int flush_needed = 1; + + if (pmd_present(orig_pmd)) { + page = pmd_page(orig_pmd); + page_remove_rmap(page, true); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + VM_BUG_ON_PAGE(!PageHead(page), page); + } else if (thp_migration_supported()) { + swp_entry_t entry; + + VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); + entry = pmd_to_swp_entry(orig_pmd); + page = pfn_to_page(swp_offset(entry)); + flush_needed = 0; + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + if (PageAnon(page)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); @@ -1696,8 +1735,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } + spin_unlock(ptl); - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + if (flush_needed) + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; } @@ -1717,6 +1758,17 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, } #endif +static pmd_t move_soft_dirty_pmd(pmd_t pmd) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + if (unlikely(is_pmd_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); +#endif + return pmd; +} + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1759,7 +1811,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + pmd = move_soft_dirty_pmd(pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd); if (new_ptl != old_ptl) spin_unlock(new_ptl); if (force_flush) @@ -1794,6 +1847,27 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, preserve_write = prot_numa && pmd_write(*pmd); ret = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (is_swap_pmd(*pmd)) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + if (is_write_migration_entry(entry)) { + pmd_t newpmd; + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); + set_pmd_at(mm, addr, pmd, newpmd); + } + goto unlock; + } +#endif + /* * Avoid trapping faults against the zero page. 
The read-only * data is likely to be read-cached on the local CPU and @@ -1859,7 +1933,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || + pmd_devmap(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -1977,14 +2052,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty, soft_dirty; + bool young, write, dirty, soft_dirty, pmd_migration = false; unsigned long addr; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2009,7 +2085,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - page = pmd_page(*pmd); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + pmd_migration = is_pmd_migration_entry(*pmd); + if (pmd_migration) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(*pmd); + page = pfn_to_page(swp_offset(entry)); + } else +#endif + page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); @@ -2028,7 +2113,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * transferred to avoid any possibility of altering * permissions across VMAs. */ - if (freeze) { + if (freeze || pmd_migration) { swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); @@ -2127,7 +2212,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); - } else if (!pmd_devmap(*pmd)) + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: @@ -2210,7 +2295,7 @@ static void freeze_page(struct page *page) VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) - ttu_flags |= TTU_MIGRATION; + ttu_flags |= TTU_SPLIT_FREEZE; unmap_success = try_to_unmap(page, ttu_flags); VM_BUG_ON_PAGE(!unmap_success, page); @@ -2745,3 +2830,66 @@ static int __init split_huge_pages_debugfs(void) } late_initcall(split_huge_pages_debugfs); #endif + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + pmd_t pmdval; + swp_entry_t entry; + pmd_t pmdswp; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + mmu_notifier_invalidate_range_start(mm, address, + address + HPAGE_PMD_SIZE); + + flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); + pmdval = *pvmw->pmd; + pmdp_invalidate(vma, address, pvmw->pmd); + if (pmd_dirty(pmdval)) + set_page_dirty(page); + entry = make_migration_entry(page, pmd_write(pmdval)); + pmdswp = swp_entry_to_pmd(entry); + if (pmd_soft_dirty(pmdval)) + pmdswp = pmd_swp_mksoft_dirty(pmdswp); + set_pmd_at(mm, address, pvmw->pmd, pmdswp); + page_remove_rmap(page, true); + put_page(page); + + mmu_notifier_invalidate_range_end(mm, address, + address + HPAGE_PMD_SIZE); +} + +void 
remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + unsigned long mmun_start = address & HPAGE_PMD_MASK; + pmd_t pmde; + swp_entry_t entry; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + entry = pmd_to_swp_entry(*pvmw->pmd); + get_page(new); + pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); + if (pmd_swp_soft_dirty(*pvmw->pmd)) + pmde = pmd_mksoft_dirty(pmde); + if (is_write_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); + + flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); + page_add_anon_rmap(new, vma, mmun_start, true); + set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + update_mmu_cache_pmd(vma, address, pvmw->pmd); +} +#endif diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f2c2492681bf..b47664358796 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -28,7 +28,7 @@ INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, - struct rb_root *root) + struct rb_root_cached *root) { struct rb_node **link; struct vm_area_struct *parent; @@ -55,7 +55,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, node->shared.rb_subtree_last = last; rb_link_node(&node->shared.rb, &parent->shared.rb, link); - rb_insert_augmented(&node->shared.rb, root, + rb_insert_augmented(&node->shared.rb, &root->rb_root, &vma_interval_tree_augment); } @@ -74,7 +74,7 @@ INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, static inline, __anon_vma_interval_tree) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { #ifdef CONFIG_DEBUG_VM_RB node->cached_vma_start = avc_start_pgoff(node); @@ -84,13 +84,13 @@ void anon_vma_interval_tree_insert(struct anon_vma_chain *node, } void anon_vma_interval_tree_remove(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { __anon_vma_interval_tree_remove(node, root); } struct anon_vma_chain * -anon_vma_interval_tree_iter_first(struct rb_root *root, +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_first(root, first, last); diff --git a/mm/madvise.c b/mm/madvise.c index eea1c733286f..21261ff0466f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (!page) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6532b219b222..15af3da5af02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -119,6 +119,7 @@ static const char *const mem_cgroup_lru_names[] = { struct mem_cgroup_tree_per_node { struct rb_root rb_root; + struct rb_node *rb_rightmost; spinlock_t lock; }; @@ -386,6 +387,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; struct mem_cgroup_per_node *mz_node; + bool rightmost = true; if (mz->on_tree) return; @@ -397,8 +399,11 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, parent = *p; mz_node = rb_entry(parent, struct 
mem_cgroup_per_node, tree_node); - if (mz->usage_in_excess < mz_node->usage_in_excess) + if (mz->usage_in_excess < mz_node->usage_in_excess) { p = &(*p)->rb_left; + rightmost = false; + } + /* * We can't avoid mem cgroups that are over their soft * limit by the same amount @@ -406,6 +411,10 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, else if (mz->usage_in_excess >= mz_node->usage_in_excess) p = &(*p)->rb_right; } + + if (rightmost) + mctz->rb_rightmost = &mz->tree_node; + rb_link_node(&mz->tree_node, parent, p); rb_insert_color(&mz->tree_node, &mctz->rb_root); mz->on_tree = true; @@ -416,6 +425,10 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, { if (!mz->on_tree) return; + + if (&mz->tree_node == mctz->rb_rightmost) + mctz->rb_rightmost = rb_prev(&mz->tree_node); + rb_erase(&mz->tree_node, &mctz->rb_root); mz->on_tree = false; } @@ -496,16 +509,15 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) static struct mem_cgroup_per_node * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { - struct rb_node *rightmost = NULL; struct mem_cgroup_per_node *mz; retry: mz = NULL; - rightmost = rb_last(&mctz->rb_root); - if (!rightmost) + if (!mctz->rb_rightmost) goto done; /* Nothing to reclaim from */ - mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); + mz = rb_entry(mctz->rb_rightmost, + struct mem_cgroup_per_node, tree_node); /* * Remove the node now but someone else can add it back, * we will to add it back at the end of reclaim to its correct @@ -1792,6 +1804,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) } stock->nr_pages += nr_pages; + if (stock->nr_pages > CHARGE_BATCH) + drain_stock(stock); + local_irq_restore(flags); } @@ -4414,12 +4429,13 @@ enum mc_target_type { MC_TARGET_NONE = 0, MC_TARGET_PAGE, MC_TARGET_SWAP, + MC_TARGET_DEVICE, }; static struct page *mc_handle_present_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { - struct page *page = vm_normal_page(vma, addr, ptent); + struct page *page = _vm_normal_page(vma, addr, ptent, true); if (!page || !page_mapped(page)) return NULL; @@ -4436,7 +4452,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return page; } -#ifdef CONFIG_SWAP +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, pte_t ptent, swp_entry_t *entry) { @@ -4445,6 +4461,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) return NULL; + + /* + * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to + * a device and because they are not accessible by CPU they are store + * as special swap entry in the CPU page table. + */ + if (is_device_private_entry(ent)) { + page = device_private_entry_to_page(ent); + /* + * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have + * a refcount of 1 when free (unlike normal page) + */ + if (!page_ref_add_unless(page, 1, 1)) + return NULL; + return page; + } + /* * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. @@ -4605,6 +4638,13 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. 
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. + * + * See Documentations/vm/hmm.txt and include/linux/hmm.h * * Called with pte lock held. */ @@ -4633,6 +4673,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; + if (is_device_private_page(page) || + is_device_public_page(page)) + ret = MC_TARGET_DEVICE; if (target) target->page = page; } @@ -4664,6 +4707,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, struct page *page = NULL; enum mc_target_type ret = MC_TARGET_NONE; + if (unlikely(is_swap_pmd(pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(pmd)); + return ret; + } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) @@ -4695,6 +4743,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { + /* + * Note their can not be MC_TARGET_DEVICE for now as we do not + * support transparent huge page with MEMORY_DEVICE_PUBLIC or + * MEMORY_DEVICE_PRIVATE but this might change. + */ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4910,6 +4963,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, putback_lru_page(page); } put_page(page); + } else if (target_type == MC_TARGET_DEVICE) { + page = target.page; + if (!mem_cgroup_move_account(page, true, + mc.from, mc.to)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + put_page(page); } spin_unlock(ptl); return 0; @@ -4921,12 +4982,16 @@ retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); + bool device = false; swp_entry_t ent; if (!mc.precharge) break; switch (get_mctgt_type(vma, addr, ptent, &target)) { + case MC_TARGET_DEVICE: + device = true; + /* fall through */ case MC_TARGET_PAGE: page = target.page; /* @@ -4937,7 +5002,7 @@ retry: */ if (PageTransCompound(page)) goto put; - if (isolate_lru_page(page)) + if (!device && isolate_lru_page(page)) goto put; if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) { @@ -4945,7 +5010,8 @@ retry: /* we uncharge from mc.from later. 
*/ mc.moved_charge++; } - putback_lru_page(page); + if (!device) + putback_lru_page(page); put: /* get_mctgt_type() gets the page */ put_page(page); break; @@ -5535,48 +5601,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, cancel_charge(memcg, nr_pages); } -static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, - unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_kmem, unsigned long nr_huge, - unsigned long nr_shmem, struct page *dummy_page) +struct uncharge_gather { + struct mem_cgroup *memcg; + unsigned long pgpgout; + unsigned long nr_anon; + unsigned long nr_file; + unsigned long nr_kmem; + unsigned long nr_huge; + unsigned long nr_shmem; + struct page *dummy_page; +}; + +static inline void uncharge_gather_clear(struct uncharge_gather *ug) +{ + memset(ug, 0, sizeof(*ug)); +} + +static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long nr_pages = nr_anon + nr_file + nr_kmem; + unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; unsigned long flags; - if (!mem_cgroup_is_root(memcg)) { - page_counter_uncharge(&memcg->memory, nr_pages); + if (!mem_cgroup_is_root(ug->memcg)) { + page_counter_uncharge(&ug->memcg->memory, nr_pages); if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) - page_counter_uncharge(&memcg->kmem, nr_kmem); - memcg_oom_recover(memcg); + page_counter_uncharge(&ug->memcg->memsw, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) + page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + memcg_oom_recover(ug->memcg); } local_irq_save(flags); - __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon); - __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file); - __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge); - __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem); - __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); - memcg_check_events(memcg, dummy_page); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); + __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem); + __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); + __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); + memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); - if (!mem_cgroup_is_root(memcg)) - css_put_many(&memcg->css, nr_pages); + if (!mem_cgroup_is_root(ug->memcg)) + css_put_many(&ug->memcg->css, nr_pages); +} + +static void uncharge_page(struct page *page, struct uncharge_gather *ug) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); + + if (!page->mem_cgroup) + return; + + /* + * Nobody should be changing or seriously looking at + * page->mem_cgroup at this point, we have fully + * exclusive access to the page. 
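
The uncharge_gather/uncharge_batch() refactor introduced here accumulates counters per owning memcg and flushes them whenever ownership changes, so a run of pages charged to the same memcg costs one set of counter updates instead of one per page (uncharge_list() below is rewritten around the same helpers). A minimal user-space sketch of that gather-and-flush pattern, with hypothetical names — this is not the kernel API:

	#include <stdio.h>
	#include <string.h>

	struct gather {
		void *owner;			/* plays the role of ug->memcg */
		unsigned long nr_pages;
	};

	static void account_uncharge(void *owner, unsigned long nr)
	{
		printf("uncharge %lu page(s) from %p\n", nr, owner);	/* stand-in sink */
	}

	static void gather_flush(struct gather *g)
	{
		if (g->owner)
			account_uncharge(g->owner, g->nr_pages);
		memset(g, 0, sizeof(*g));	/* mirrors uncharge_gather_clear() */
	}

	static void gather_one(struct gather *g, void *owner, unsigned long nr)
	{
		if (g->owner != owner) {	/* owner changed: flush the current batch */
			gather_flush(g);
			g->owner = owner;
		}
		g->nr_pages += nr;
	}
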
+ */ + + if (ug->memcg != page->mem_cgroup) { + if (ug->memcg) { + uncharge_batch(ug); + uncharge_gather_clear(ug); + } + ug->memcg = page->mem_cgroup; + } + + if (!PageKmemcg(page)) { + unsigned int nr_pages = 1; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + ug->nr_huge += nr_pages; + } + if (PageAnon(page)) + ug->nr_anon += nr_pages; + else { + ug->nr_file += nr_pages; + if (PageSwapBacked(page)) + ug->nr_shmem += nr_pages; + } + ug->pgpgout++; + } else { + ug->nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } + + ug->dummy_page = page; + page->mem_cgroup = NULL; } static void uncharge_list(struct list_head *page_list) { - struct mem_cgroup *memcg = NULL; - unsigned long nr_shmem = 0; - unsigned long nr_anon = 0; - unsigned long nr_file = 0; - unsigned long nr_huge = 0; - unsigned long nr_kmem = 0; - unsigned long pgpgout = 0; + struct uncharge_gather ug; struct list_head *next; - struct page *page; + + uncharge_gather_clear(&ug); /* * Note that the list can be a single page->lru; hence the @@ -5584,57 +5704,16 @@ static void uncharge_list(struct list_head *page_list) */ next = page_list->next; do { + struct page *page; + page = list_entry(next, struct page, lru); next = page->lru.next; - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); - - if (!page->mem_cgroup) - continue; - - /* - * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully - * exclusive access to the page. - */ - - if (memcg != page->mem_cgroup) { - if (memcg) { - uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_kmem, nr_huge, nr_shmem, page); - pgpgout = nr_anon = nr_file = nr_kmem = 0; - nr_huge = nr_shmem = 0; - } - memcg = page->mem_cgroup; - } - - if (!PageKmemcg(page)) { - unsigned int nr_pages = 1; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - nr_huge += nr_pages; - } - if (PageAnon(page)) - nr_anon += nr_pages; - else { - nr_file += nr_pages; - if (PageSwapBacked(page)) - nr_shmem += nr_pages; - } - pgpgout++; - } else { - nr_kmem += 1 << compound_order(page); - __ClearPageKmemcg(page); - } - - page->mem_cgroup = NULL; + uncharge_page(page, &ug); } while (next != page_list); - if (memcg) - uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_kmem, nr_huge, nr_shmem, page); + if (ug.memcg) + uncharge_batch(&ug); } /** @@ -5646,6 +5725,8 @@ static void uncharge_list(struct list_head *page_list) */ void mem_cgroup_uncharge(struct page *page) { + struct uncharge_gather ug; + if (mem_cgroup_disabled()) return; @@ -5653,8 +5734,9 @@ void mem_cgroup_uncharge(struct page *page) if (!page->mem_cgroup) return; - INIT_LIST_HEAD(&page->lru); - uncharge_list(&page->lru); + uncharge_gather_clear(&ug); + uncharge_page(page, &ug); + uncharge_batch(&ug); } /** @@ -5819,8 +5901,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); - page_counter_uncharge(&memcg->memory, nr_pages); - css_put_many(&memcg->css, nr_pages); + refill_stock(memcg, nr_pages); } static int __init cgroup_memory(char *s) @@ -5876,6 +5957,7 @@ static int __init mem_cgroup_init(void) node_online(node) ? 
node : NUMA_NO_NODE); rtpn->rb_root = RB_ROOT; + rtpn->rb_rightmost = NULL; spin_lock_init(&rtpn->lock); soft_limit_tree.rb_tree_per_node[node] = rtpn; } diff --git a/mm/memory.c b/mm/memory.c index 13ee83b43878..ec4e15494901 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -49,6 +49,7 @@ #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/memremap.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> @@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); @@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_zero_pfn(pfn)) - print_bad_pte(vma, addr, pte, NULL); + if (is_zero_pfn(pfn)) + return NULL; + + /* + * Device public pages are special pages (they are ZONE_DEVICE + * pages but different from persistent memory). They behave + * allmost like normal pages. The difference is that they are + * not on the lru and thus should never be involve with any- + * thing that involve lru manipulation (mlock, numa balancing, + * ...). + * + * This is why we still want to return NULL for such page from + * vm_normal_page() so that we do not have to special case all + * call site of vm_normal_page(). + */ + if (likely(pfn < highest_memmap_pfn)) { + struct page *page = pfn_to_page(pfn); + + if (is_device_public_page(page)) { + if (with_public_device) + return page; + return NULL; + } + } + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } + } else if (is_device_private_entry(entry)) { + page = device_private_entry_to_page(entry); + + /* + * Update rss count even for unaddressable pages, as + * they should treated just like normal pages in this + * respect. + * + * We will likely want to have some new rss counters + * for unaddressable pages, at some point. But for now + * keep things as they are. + */ + get_page(page); + rss[mm_counter(page)]++; + page_dup_rmap(page, false); + + /* + * We do not preserve soft-dirty information, because so + * far, checkpoint/restore is the only feature that + * requires that. And checkpoint/restore does not work + * when a device driver is involved (you cannot easily + * save and restore device driver state). + */ + if (is_write_device_private_entry(entry) && + is_cow_mapping(vm_flags)) { + make_device_private_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + } else if (pte_devmap(pte)) { + page = pte_page(pte); + + /* + * Cache coherent device memory behave like regular page and + * not like persistent memory page. 
For more informations see + * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h + */ + if (is_device_public_page(page)) { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } } out_set_pte: @@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, @@ -1236,7 +1303,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -1273,6 +1340,29 @@ again: } continue; } + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry) && is_device_private_entry(entry)) { + struct page *page = device_private_entry_to_page(entry); + + if (unlikely(details && details->check_mapping)) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping != + page_rmapping(page)) + continue; + } + + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); + put_page(page); + continue; + } + /* If details->check_mapping, we leave swap entries. */ if (unlikely(details)) continue; @@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); @@ -2671,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct rb_root *root, +static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; @@ -2735,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } @@ -2775,6 +2865,14 @@ int do_swap_page(struct vm_fault *vmf) if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); + } else if (is_device_private_entry(entry)) { + /* + * For un-addressable device memory we call the pgmap + * fault handler callback. The callback must migrate + * the page back to some CPU accessible page. 
+ */ + ret = device_private_entry_fault(vma, vmf->address, entry, + vmf->flags, vmf->pmd); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { @@ -3863,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; + unsigned int dirty = flags & FAULT_FLAG_WRITE; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; @@ -3885,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { - unsigned int dirty = flags & FAULT_FLAG_WRITE; /* NUMA case for anonymous PUDs would go here */ @@ -3911,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pmd_t orig_pmd = *vmf.pmd; barrier(); + if (unlikely(is_swap_pmd(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + if (is_pmd_migration_entry(orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((vmf.flags & FAULT_FLAG_WRITE) && - !pmd_write(orig_pmd)) { + if (dirty && !pmd_write(orig_pmd)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -3949,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) + return VM_FAULT_SIGSEGV; + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. 
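
One pattern recurs throughout this series (copy_pmd_range() and zap_pmd_range() above, change_pmd_range() and move_page_tables() further down, and the __handle_mm_fault() hunk here): a PMD holding a THP migration entry is not present, so every walker that previously tested only pmd_trans_huge() || pmd_devmap() must also test is_swap_pmd() before descending to the PTE level, otherwise the entry would be mistaken for an empty or bad page table. A hedged sketch of the guard; walk_one_pmd(), handle_huge_pmd() and walk_ptes() are hypothetical, only the condition mirrors the hunks in this patch:

	static int walk_one_pmd(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end)
	{
		/* migration-entry, huge and devmap PMDs all take the huge path */
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
			return handle_huge_pmd(vma, pmd, addr, end);

		if (pmd_none_or_clear_bad(pmd))
			return 0;		/* nothing mapped at this PMD */

		return walk_ptes(vma, pmd, addr, end);
	}
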
@@ -3956,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) mem_cgroup_oom_enable(); - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, - flags & FAULT_FLAG_INSTRUCTION, - flags & FAULT_FLAG_REMOTE)) - return VM_FAULT_SIGSEGV; - if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 73bf17df6899..e882cb6da994 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -99,7 +99,7 @@ void mem_hotplug_done(void) /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { - struct resource *res; + struct resource *res, *conflict; res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); @@ -108,7 +108,13 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->start = start; res->end = start + size - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res) < 0) { + conflict = request_resource_conflict(&iomem_resource, res); + if (conflict) { + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_debug("Device unaddressable memory block " + "memory hotplug at %#010llx !\n", + (unsigned long long)start); + } pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); return ERR_PTR(-EEXIST); @@ -1380,7 +1386,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } + } else if (thp_migration_supported() && PageTransHuge(page)) + pfn = page_to_pfn(compound_head(page)) + + hpage_nr_pages(page) - 1; if (!get_page_unless_zero(page)) continue; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 618ab125228b..006ba625c0b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,6 +97,7 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> +#include <linux/swapops.h> #include <asm/tlbflush.h> #include <linux/uaccess.h> @@ -412,6 +413,64 @@ struct queue_pages { }; /* + * Check if the page's nid is in qp->nmask. + * + * If MPOL_MF_INVERT is set in qp->flags, check if the nid is + * in the invert of qp->nmask. 
+ */ +static inline bool queue_pages_required(struct page *page, + struct queue_pages *qp) +{ + int nid = page_to_nid(page); + unsigned long flags = qp->flags; + + return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); +} + +static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + int ret = 0; + struct page *page; + struct queue_pages *qp = walk->private; + unsigned long flags; + + if (unlikely(is_pmd_migration_entry(*pmd))) { + ret = 1; + goto unlock; + } + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + __split_huge_pmd(walk->vma, pmd, addr, false, NULL); + goto out; + } + if (!thp_migration_supported()) { + get_page(page); + spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + goto out; + } + if (!queue_pages_required(page, qp)) { + ret = 1; + goto unlock; + } + + ret = 1; + flags = qp->flags; + /* go to thp migration */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, qp->pagelist, flags); +unlock: + spin_unlock(ptl); +out: + return ret; +} + +/* * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. */ @@ -422,30 +481,15 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid, ret; + int ret; pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge(*pmd)) { - ptl = pmd_lock(walk->mm, pmd); - if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { - spin_unlock(ptl); - __split_huge_pmd(vma, pmd, addr, false, NULL); - } else { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return 0; - } - } else { - spin_unlock(ptl); - } + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + ret = queue_pages_pmd(pmd, ptl, addr, end, walk); + if (ret) + return 0; } if (pmd_trans_unstable(pmd)) @@ -464,10 +508,9 @@ retry: */ if (PageReserved(page)) continue; - nid = page_to_nid(page); - if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page)) { + if (PageTransCompound(page) && !thp_migration_supported()) { get_page(page); pte_unmap_unlock(pte, ptl); lock_page(page); @@ -497,7 +540,6 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid; struct page *page; spinlock_t *ptl; pte_t entry; @@ -507,8 +549,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto unlock; page = pte_page(entry); - nid = page_to_nid(page); - if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + if (!queue_pages_required(page, qp)) goto unlock; /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || @@ -881,19 +922,21 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, #ifdef CONFIG_MIGRATION /* - * page migration + * page migration, thp tail pages can be passed. */ static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { + struct page *head = compound_head(page); /* * Avoid migrating a page that is shared with others. 
*/ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (!isolate_lru_page(page)) { - list_add_tail(&page->lru, pagelist); - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { + if (!isolate_lru_page(head)) { + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); } } } @@ -903,7 +946,17 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else + else if (thp_migration_supported() && PageTransHuge(page)) { + struct page *thp; + + thp = alloc_pages_node(node, + (GFP_TRANSHUGE | __GFP_THISNODE), + HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } else return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -1069,6 +1122,15 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) if (PageHuge(page)) { BUG_ON(!vma); return alloc_huge_page_noerr(vma, address, 1); + } else if (thp_migration_supported() && PageTransHuge(page)) { + struct page *thp; + + thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, + HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; } /* * if !vma, alloc_page_vma() will use task or system default policy @@ -1683,8 +1745,7 @@ unsigned int mempolicy_slab_node(void) * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the * number of present nodes. */ -static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long n) +static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; @@ -1717,7 +1778,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol, BUG_ON(shift < PAGE_SHIFT); off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, vma, off); + return offset_il_node(pol, off); } else return interleave_nodes(pol); } @@ -2172,20 +2233,15 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long int polnid = -1; int ret = -1; - BUG_ON(!vma); - pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); - pgoff = vma->vm_pgoff; pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, vma, pgoff); + polnid = offset_il_node(pol, pgoff); break; case MPOL_PREFERRED: diff --git a/mm/migrate.c b/mm/migrate.c index e84eeb4e4356..6954c1435833 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,9 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> +#include <linux/pfn_t.h> +#include <linux/memremap.h> +#include <linux/userfaultfd_k.h> #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> @@ -185,8 +188,8 @@ void putback_movable_pages(struct list_head *l) unlock_page(page); put_page(page); } else { - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_cache(page), -hpage_nr_pages(page)); putback_lru_page(page); } } @@ -216,6 +219,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, 
new = page - pvmw.page->index + linear_page_index(vma, pvmw.address); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + remove_migration_pmd(&pvmw, new); + continue; + } +#endif + get_page(new); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*pvmw.pte)) @@ -228,7 +240,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - flush_dcache_page(new); + if (unlikely(is_zone_device_page(new))) { + if (is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else if (is_device_public_page(new)) { + pte = pte_mkdevmap(pte); + flush_dcache_page(new); + } + } else + flush_dcache_page(new); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -330,6 +352,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, __migration_entry_wait(mm, pte, ptl); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl; + struct page *page; + + ptl = pmd_lock(mm, pmd); + if (!is_pmd_migration_entry(*pmd)) + goto unlock; + page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); + if (!get_page_unless_zero(page)) + goto unlock; + spin_unlock(ptl); + wait_on_page_locked(page); + put_page(page); + return; +unlock: + spin_unlock(ptl); +} +#endif + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, @@ -398,6 +441,13 @@ int migrate_page_move_mapping(struct address_space *mapping, int expected_count = 1 + extra_count; void **pslot; + /* + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. 
+ */ + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); + if (!mapping) { /* Anonymous page without mapping */ if (page_count(page) != expected_count) @@ -604,15 +654,10 @@ static void copy_huge_page(struct page *dst, struct page *src) /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_states(struct page *newpage, struct page *page) { int cpupid; - if (PageHuge(page) || PageTransHuge(page)) - copy_huge_page(newpage, page); - else - copy_highpage(newpage, page); - if (PageError(page)) SetPageError(newpage); if (PageReferenced(page)) @@ -666,6 +711,17 @@ void migrate_page_copy(struct page *newpage, struct page *page) mem_cgroup_migrate(page, newpage); } +EXPORT_SYMBOL(migrate_page_states); + +void migrate_page_copy(struct page *newpage, struct page *page) +{ + if (PageHuge(page) || PageTransHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); + + migrate_page_states(newpage, page); +} EXPORT_SYMBOL(migrate_page_copy); /************************************************************ @@ -691,7 +747,10 @@ int migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -741,12 +800,15 @@ int buffer_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); bh = head; do { unlock_buffer(bh); - put_bh(bh); + put_bh(bh); bh = bh->b_this_page; } while (bh != head); @@ -805,8 +867,13 @@ static int fallback_migrate_page(struct address_space *mapping, { if (PageDirty(page)) { /* Only writeback pages in full synchronous migration */ - if (mode != MIGRATE_SYNC) + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: return -EBUSY; + } return writeout(mapping, page); } @@ -943,7 +1010,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ - if (mode != MIGRATE_SYNC) { + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: rc = -EBUSY; goto out_unlock; } @@ -1088,7 +1159,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) { + if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { lock_page(page); rc = split_huge_page(page); unlock_page(page); @@ -1116,8 +1187,8 @@ out: * as __PageMovable */ if (likely(!__PageMovable(page))) - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_cache(page), -hpage_nr_pages(page)); } /* @@ -1213,8 +1284,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; if (!trylock_page(hpage)) { - if (!force || mode != MIGRATE_SYNC) + if (!force) + goto out; + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: goto out; + } lock_page(hpage); } @@ -1391,7 +1469,17 @@ static struct page *new_page_node(struct page *p, unsigned long private, if (PageHuge(p)) return alloc_huge_page_node(page_hstate(compound_head(p)), pm->node); - else 
+ else if (thp_migration_supported() && PageTransHuge(p)) { + struct page *thp; + + thp = alloc_pages_node(pm->node, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, + HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } else return __alloc_pages_node(pm->node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -1418,6 +1506,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, for (pp = pm; pp->node != MAX_NUMNODES; pp++) { struct vm_area_struct *vma; struct page *page; + struct page *head; + unsigned int follflags; err = -EFAULT; vma = find_vma(mm, pp->addr); @@ -1425,8 +1515,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, pp->addr, - FOLL_GET | FOLL_SPLIT | FOLL_DUMP); + follflags = FOLL_GET | FOLL_DUMP; + if (!thp_migration_supported()) + follflags |= FOLL_SPLIT; + page = follow_page(vma, pp->addr, follflags); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1436,7 +1528,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - pp->page = page; err = page_to_nid(page); if (err == pp->node) @@ -1451,16 +1542,22 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; if (PageHuge(page)) { - if (PageHead(page)) + if (PageHead(page)) { isolate_huge_page(page, &pagelist); + err = 0; + pp->page = page; + } goto put_and_set; } - err = isolate_lru_page(page); + pp->page = compound_head(page); + head = compound_head(page); + err = isolate_lru_page(head); if (!err) { - list_add_tail(&page->lru, &pagelist); - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + list_add_tail(&head->lru, &pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); } put_and_set: /* @@ -2029,3 +2126,859 @@ out_unlock: #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ + +#if defined(CONFIG_MIGRATE_VMA_HELPER) +struct migrate_vma { + struct vm_area_struct *vma; + unsigned long *dst; + unsigned long *src; + unsigned long cpages; + unsigned long npages; + unsigned long start; + unsigned long end; +}; + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if 
(is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + pfn = pte_pfn(pte); + + if (pte_none(pte)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } + + if (!pte_present(pte)) { + mpfn = pfn = 0; + + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = device_private_entry_to_page(entry); + mpfn = migrate_pfn(page_to_pfn(page))| + MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + if (is_write_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } + page = _vm_normal_page(migrate->vma, addr, pte, true); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = pfn = 0; + goto next; + } + pfn = page_to_pfn(page); + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + migrate->cpages++; + + /* + * Optimize for the common case where page is only mapped once + * in one process. If we can lock the page, then we can safely + * set up a special migration page table entry now. + */ + if (trylock_page(page)) { + pte_t swp_pte; + + mpfn |= MIGRATE_PFN_LOCKED; + ptep_get_and_clear(mm, addr, ptep); + + /* Setup special migration page table entry */ + entry = make_migration_entry(page, pte_write(pte)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. 
+ */ + page_remove_rmap(page, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } + +next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return 0; +} + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mm_walk mm_walk; + + mm_walk.pmd_entry = migrate_vma_collect_pmd; + mm_walk.pte_entry = NULL; + mm_walk.pte_hole = migrate_vma_collect_hole; + mm_walk.hugetlb_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.vma = migrate->vma; + mm_walk.mm = migrate->vma->vm_mm; + mm_walk.private = migrate; + + mmu_notifier_invalidate_range_start(mm_walk.mm, + migrate->start, + migrate->end); + walk_page_range(migrate->start, migrate->end, &mm_walk); + mmu_notifier_invalidate_range_end(mm_walk.mm, + migrate->start, + migrate->end); + + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * migrate_page_move_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). + */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) { + /* + * Private page can never be pin as they have no valid pte and + * GUP will fail for those. Yet if there is a pending migration + * a thread might try to wait on the pte migration entry and + * will bump the page reference count. Sadly there is no way to + * differentiate a regular pin from migration wait. Hence to + * avoid 2 racing thread trying to migrate back to CPU to enter + * infinite loop (one stoping migration because the other is + * waiting on pte migration entry). We always return true here. + * + * FIXME proper solution is to rework migration_entry_wait() so + * it does not need to take a reference on page. + */ + if (is_device_private_page(page)) + return true; + + /* + * Only allow device public page to be migrated and account for + * the extra reference count imply by ZONE_DEVICE pages. 
+ */ + if (!is_device_public_page(page)) + return false; + extra++; + } + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_prepare() - lock pages and isolate them from the lru + * @migrate: migrate struct containing all migration information + * + * This locks pages that have been collected by migrate_vma_collect(). Once each + * page is locked it is isolated from the lru (for non-device pages). Finally, + * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be + * migrated by concurrent kernel threads. + */ +static void migrate_vma_prepare(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + bool allow_drain = true; + + lru_add_drain(); + + for (i = 0; (i < npages) && migrate->cpages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + bool remap = true; + + if (!page) + continue; + + if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { + /* + * Because we are migrating several pages there can be + * a deadlock between 2 concurrent migration where each + * are waiting on each other page lock. + * + * Make migrate_vma() a best effort thing and backoff + * for any page we can not lock right away. + */ + if (!trylock_page(page)) { + migrate->src[i] = 0; + migrate->cpages--; + put_page(page); + continue; + } + remap = false; + migrate->src[i] |= MIGRATE_PFN_LOCKED; + } + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + put_page(page); + } + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + if (!migrate_vma_check_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + + if (!is_zone_device_page(page)) + putback_lru_page(page); + else + put_page(page); + } + } + } + + for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_pte(page, migrate->vma, addr, page); + + migrate->src[i] = 0; + unlock_page(page); + put_page(page); + restore--; + } +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Replace page mapping (CPU page table pte) with a special migration pte entry + * and check again if it has been pinned. Pinned pages are restored because we + * cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. 
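
Since that device driver callback lives outside mm/, a hedged sketch of one helps make the contract concrete. Everything prefixed my_ below is hypothetical driver code; only migrate_vma() (added later in this file), the migrate_vma_ops callbacks, migrate_pfn()/migrate_pfn_to_page() and the MIGRATE_PFN_* flags come from this series, with prototypes written to match the ops->alloc_and_copy()/ops->finalize_and_map() call sites and struct migrate_vma_ops in include/linux/migrate.h:

	static void my_alloc_and_copy(struct vm_area_struct *vma,
				      const unsigned long *src, unsigned long *dst,
				      unsigned long start, unsigned long end,
				      void *private)
	{
		struct my_device *dev = private;		/* hypothetical driver state */
		unsigned long addr, i;

		for (i = 0, addr = start; addr < end; addr += PAGE_SIZE, i++) {
			struct page *spage = migrate_pfn_to_page(src[i]);
			struct page *dpage;

			dst[i] = 0;
			if (!(src[i] & MIGRATE_PFN_MIGRATE))
				continue;	/* collect/unmap gave up on this page */

			dpage = my_device_alloc_page(dev);	/* hypothetical allocator */
			if (!dpage)
				continue;	/* dst[i] == 0 means "do not migrate" */

			if (spage)
				my_device_copy(dev, dpage, spage);	/* hypothetical DMA copy */
			else
				my_device_clear_page(dev, dpage);	/* hole or zero-page source */

			lock_page(dpage);	/* migrate_vma_finalize() unlocks the new page */
			dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		}
	}

	static void my_finalize_and_map(struct vm_area_struct *vma,
					const unsigned long *src, const unsigned long *dst,
					unsigned long start, unsigned long end,
					void *private)
	{
		/* src entries that lost MIGRATE_PFN_MIGRATE were not migrated;
		 * a real driver would update its own tracking here. */
	}

	static const struct migrate_vma_ops my_migrate_ops = {
		.alloc_and_copy		= my_alloc_and_copy,
		.finalize_and_map	= my_finalize_and_map,
	};

The driver would then call, with mmap_sem held for read and both arrays sized to (end - start) >> PAGE_SHIFT entries: migrate_vma(&my_migrate_ops, vma, start, end, src, dst, dev);
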
+ */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + if (page_mapped(page)) { + try_to_unmap(page, flags); + if (page_mapped(page)) + goto restore; + } + + if (migrate_vma_check_page(page)) + continue; + +restore: + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } + + for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_ptes(page, page, false); + + migrate->src[i] = 0; + unlock_page(page); + restore--; + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + } +} + +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + unsigned long *dst) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + struct mem_cgroup *memcg; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * parallel threads are excluded by other means. + * + * Here we only have down_read(mmap_sem). + */ + if (pte_alloc(mm, pmdp, addr)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else if (is_device_public_page(page)) { + entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkdevmap(entry); + } + } else { + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + flush = true; + } else if (!pte_none(*ptep)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + /* + * Check for usefaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); + if (!is_zone_device_page(page)) + lru_cache_add_active_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + +/* + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +static void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr, i, mmu_start; + bool notified = false; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + continue; + } + if (!notified) { + mmu_start = addr; + notified = true; + mmu_notifier_invalidate_range_start(mm, + mmu_start, + migrate->end); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i], + &migrate->dst[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_zone_device_page(newpage)) { + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when + * migrating to un-addressable device memory. 
+ */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else if (!is_device_public_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not + * supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } + + if (notified) + mmu_notifier_invalidate_range_end(mm, mmu_start, + migrate->end); +} + +/* + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +static void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + remove_migration_ptes(page, newpage, false); + unlock_page(page); + migrate->cpages--; + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } +} + +/* + * migrate_vma() - migrate a range of memory inside vma + * + * @ops: migration callback for allocating destination memory and copying + * @vma: virtual memory area containing the range to be migrated + * @start: start address of the range to migrate (inclusive) + * @end: end address of the range to migrate (exclusive) + * @src: array of hmm_pfn_t containing source pfns + * @dst: array of hmm_pfn_t containing destination pfns + * @private: pointer passed back to each of the callback + * Returns: 0 on success, error code otherwise + * + * This function tries to migrate a range of memory virtual address range, using + * callbacks to allocate and copy memory from source to destination. First it + * collects all the pages backing each virtual address in the range, saving this + * inside the src array. Then it locks those pages and unmaps them. Once the pages + * are locked and unmapped, it checks whether each page is pinned or not. Pages + * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) + * in the corresponding src array entry. It then restores any pages that are + * pinned, by remapping and unlocking those pages. + * + * At this point it calls the alloc_and_copy() callback. For documentation on + * what is expected from that callback, see struct migrate_vma_ops comments in + * include/linux/migrate.h + * + * After the alloc_and_copy() callback, this function goes over each entry in + * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then the function tries to migrate struct page information from the source + * struct page to the destination struct page. 
If it fails to migrate the struct + * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src + * array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * It then calls the finalize_and_map() callback. See comments for "struct + * migrate_vma_ops", in include/linux/migrate.h for details about + * finalize_and_map() behavior. + * + * After the finalize_and_map() callback, for successfully migrated pages, this + * function updates the CPU page table to point to new pages, otherwise it + * restores the CPU page table to point to the original source pages. + * + * Function returns 0 after the above steps, even if no pages were migrated + * (The function only returns an error if any of the arguments are invalid.) + * + * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT + * unsigned long entries. + */ +int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private) +{ + struct migrate_vma migrate; + + /* Sanity check the arguments */ + start &= PAGE_MASK; + end &= PAGE_MASK; + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + return -EINVAL; + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end <= vma->vm_start || end > vma->vm_end) + return -EINVAL; + if (!ops || !src || !dst || start >= end) + return -EINVAL; + + memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); + migrate.src = src; + migrate.dst = dst; + migrate.start = start; + migrate.npages = 0; + migrate.cpages = 0; + migrate.end = end; + migrate.vma = vma; + + /* Collect, and try to unmap source pages */ + migrate_vma_collect(&migrate); + if (!migrate.cpages) + return 0; + + /* Lock and isolate page */ + migrate_vma_prepare(&migrate); + if (!migrate.cpages) + return 0; + + /* Unmap pages */ + migrate_vma_unmap(&migrate); + if (!migrate.cpages) + return 0; + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the callback. + * + * Note that migration can fail in migrate_vma_struct_page() for each + * individual page. + */ + ops->alloc_and_copy(vma, src, dst, start, end, private); + + /* This does the real migration of struct page */ + migrate_vma_pages(&migrate); + + ops->finalize_and_map(vma, src, dst, start, end, private); + + /* Unlock and remap pages */ + migrate_vma_finalize(&migrate); + + return 0; +} +EXPORT_SYMBOL(migrate_vma); +#endif /* defined(MIGRATE_VMA_HELPER) */ diff --git a/mm/mlock.c b/mm/mlock.c index b562b5523a65..dfc6f1912176 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -365,8 +365,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * @start + PAGE_SIZE when no page could be added by the pte walk. 
*/ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, - struct vm_area_struct *vma, int zoneid, unsigned long start, - unsigned long end) + struct vm_area_struct *vma, struct zone *zone, + unsigned long start, unsigned long end) { pte_t *pte; spinlock_t *ptl; @@ -394,7 +394,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, * Break if page could not be obtained or the page's node+zone does not * match */ - if (!page || page_zone_id(page) != zoneid) + if (!page || page_zone(page) != zone) break; /* @@ -446,7 +446,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long page_increm; struct pagevec pvec; struct zone *zone; - int zoneid; pagevec_init(&pvec, 0); /* @@ -481,7 +480,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, */ pagevec_add(&pvec, page); zone = page_zone(page); - zoneid = page_zone_id(page); /* * Try to fill the rest of pagevec using fast @@ -490,7 +488,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * pagevec. */ start = __munlock_pagevec_fill(&pvec, vma, - zoneid, start, end); + zone, start, end); __munlock_pagevec(&pvec, zone); goto next; } diff --git a/mm/mmap.c b/mm/mmap.c index 4c5981651407..680506faceae 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -685,7 +685,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; struct address_space *mapping = NULL; - struct rb_root *root = NULL; + struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool start_changed = false, end_changed = false; @@ -3340,7 +3340,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -3356,7 +3356,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } @@ -3458,7 +3458,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -3472,7 +3472,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } diff --git a/mm/mprotect.c b/mm/mprotect.c index bd0f409922cb..6d3e2f082290 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -125,6 +125,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pages++; } + + if (is_write_device_private_entry(entry)) { + pte_t newpte; + + /* + * We do not preserve soft-dirtiness. See + * copy_one_pte() for explanation. 
+ */ + make_device_private_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_pte_at(mm, addr, pte, newpte); + + pages++; + } } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -149,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) continue; @@ -159,7 +173,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); } else { diff --git a/mm/mremap.c b/mm/mremap.c index 7395564daa6c..cfec004c4ff9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -223,7 +223,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (pmd_trans_huge(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; /* See comment in move_ptes() */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9add06fe768..c841af88836a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2741,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order) static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA - enum zone_stat_item local_stat = NUMA_LOCAL; + enum numa_stat_item local_stat = NUMA_LOCAL; if (z->node != numa_node_id()) local_stat = NUMA_OTHER; if (z->node == preferred_zone->node) - __inc_zone_state(z, NUMA_HIT); + __inc_numa_state(z, NUMA_HIT); else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); + __inc_numa_state(z, NUMA_MISS); + __inc_numa_state(preferred_zone, NUMA_FOREIGN); } - __inc_zone_state(z, local_stat); + __inc_numa_state(z, local_stat); #endif } @@ -4183,10 +4183,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; + alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8ec6ba230bb9..6a03946469a9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_swap_pte(*pvmw->pte)) return false; entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) return false; if (migration_entry_to_page(entry) - pvmw->page >= @@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) WARN_ON_ONCE(1); #endif } else { + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(*pvmw->pte); + if (is_device_private_entry(entry) && + device_private_entry_to_page(entry) == pvmw->page) + return true; + } + if (!pte_present(*pvmw->pte)) return false; @@ -138,16 +148,28 @@ restart: if (!pud_present(*pud)) return false; pvmw->pmd = pmd_offset(pud, 
pvmw->address); - if (pmd_trans_huge(*pvmw->pmd)) { + if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); - if (!pmd_present(*pvmw->pmd)) - return not_found(pvmw); if (likely(pmd_trans_huge(*pvmw->pmd))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); if (pmd_page(*pvmw->pmd) != page) return not_found(pvmw); return true; + } else if (!pmd_present(*pvmw->pmd)) { + if (thp_migration_supported()) { + if (!(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); + + if (migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; + } + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + return not_found(pvmw); } else { /* THP pmd was split under us: handle on pte level */ spin_unlock(pvmw->ptl); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c99d9512a45b..1175f6a24fdb 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; diff --git a/mm/rmap.c b/mm/rmap.c index c570f82e6827..b874c4761e84 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -63,6 +63,7 @@ #include <linux/hugetlb.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> +#include <linux/memremap.h> #include <asm/tlbflush.h> @@ -390,7 +391,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (RB_EMPTY_ROOT(&anon_vma->rb_root)) { + if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->degree--; continue; } @@ -424,7 +425,7 @@ static void anon_vma_ctor(void *data) init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); - anon_vma->rb_root = RB_ROOT; + anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) @@ -1346,9 +1347,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; + if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && + is_zone_device_page(page) && !is_device_private_page(page)) + return true; + if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, - flags & TTU_MIGRATION, page); + flags & TTU_SPLIT_FREEZE, page); } /* @@ -1360,6 +1365,19 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte && (flags & TTU_MIGRATION)) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + + if (!PageAnon(page)) + continue; + + set_pmd_migration_entry(&pvmw, page); + continue; + } +#endif + /* * If the page is mlock()d, we cannot swap it out. 
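The new non-present pmd handling in page_vma_mapped_walk() above boils down to one question: does this pmd encode a migration entry for the page being walked? An illustrative condensation, using only helpers that appear in the hunks (the function name is made up, and as in the real walk the pmd lock and the page lock are assumed to be held):

	#include <linux/mm.h>
	#include <linux/huge_mm.h>
	#include <linux/swapops.h>

	static bool pmd_is_migration_entry_of(pmd_t pmd, struct page *page)
	{
		/* A PMD-mapped THP under migration is always non-present. */
		if (pmd_present(pmd))
			return false;
		/* Without ARCH_ENABLE_THP_MIGRATION such a pmd should not exist. */
		if (!thp_migration_supported() || !is_pmd_migration_entry(pmd))
			return false;
		return migration_entry_to_page(pmd_to_swp_entry(pmd)) == page;
	}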
* If it's recently referenced (perhaps page_referenced @@ -1390,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address = pvmw.address; + if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION) && + is_zone_device_page(page)) { + swp_entry_t entry; + pte_t swp_pte; + + pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, 0); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + goto discard; + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { @@ -1445,7 +1484,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { + (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; pte_t swp_pte; /* @@ -1575,7 +1614,8 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) + if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE)) + && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) diff --git a/mm/slub.c b/mm/slub.c index ddb04576b342..d39a5d3834b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4232,7 +4232,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); diff --git a/mm/sparse.c b/mm/sparse.c index a9783acf2bb9..83b3bf6461af 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -626,7 +626,7 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(start_pfn); + unsigned long section_nr = pfn_to_section_nr(pfn); struct mem_section *ms; /* onlining code should never touch invalid ranges */ diff --git a/mm/swap.c b/mm/swap.c index 62d96b8e5eb3..9295ae960d66 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold) if (is_huge_zero_page(page)) continue; + /* Device public page can not be huge page */ + if (is_device_public_page(page)) { + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, + flags); + locked_pgdat = NULL; + } + put_zone_device_private_or_public_page(page); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; diff --git a/mm/swapfile.c b/mm/swapfile.c index d483278ee35b..bf91dc9e7a79 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3290,7 +3290,8 @@ bad_swap: p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); - vfree(cluster_info); + kvfree(cluster_info); + kvfree(frontswap_map); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { inode_unlock(inode); diff --git a/mm/vmstat.c b/mm/vmstat.c index c7e4b8458023..4bb13e72ac97 100644 --- 
a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STATS_THRESHOLD (U16_MAX - 2) + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -87,8 +89,10 @@ void vm_events_fold_cpu(int cpu) * vm_stat contains the global counters */ atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); +EXPORT_SYMBOL(vm_numa_stat); EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -604,6 +608,32 @@ EXPORT_SYMBOL(dec_node_page_state); * Fold a differential into the global counters. * Returns the number of counters updated. */ +#ifdef CONFIG_NUMA +static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) +{ + int i; + int changes = 0; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (numa_diff[i]) { + atomic_long_add(numa_diff[i], &vm_numa_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); + changes++; + } + return changes; +} +#else static int fold_diff(int *zone_diff, int *node_diff) { int i; @@ -622,6 +652,7 @@ static int fold_diff(int *zone_diff, int *node_diff) } return changes; } +#endif /* CONFIG_NUMA */ /* * Update the zone counters for the current cpu. @@ -645,6 +676,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; @@ -666,6 +700,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } #ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); + if (v) { + + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + __this_cpu_write(p->expire, 3); + } + } + if (do_pagesets) { cond_resched(); /* @@ -712,7 +758,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } +#ifdef CONFIG_NUMA + changes += fold_diff(global_zone_diff, global_numa_diff, + global_node_diff); +#else changes += fold_diff(global_zone_diff, global_node_diff); +#endif return changes; } @@ -727,6 +778,9 @@ void cpu_vm_stats_fold(int cpu) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { @@ -743,6 +797,18 @@ void cpu_vm_stats_fold(int cpu) atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (p->vm_numa_stat_diff[i]) { + int v; + + v = p->vm_numa_stat_diff[i]; + p->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + } +#endif } for_each_online_pgdat(pgdat) { @@ -761,7 +827,11 @@ void cpu_vm_stats_fold(int cpu) } } +#ifdef CONFIG_NUMA + fold_diff(global_zone_diff, global_numa_diff, global_node_diff); +#else fold_diff(global_zone_diff, global_node_diff); +#endif } /* @@ -779,10 +849,36 @@ void drain_zonestat(struct 
zone *zone, struct per_cpu_pageset *pset) atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &vm_zone_stat[i]); } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (pset->vm_numa_stat_diff[i]) { + int v = pset->vm_numa_stat_diff[i]; + + pset->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + atomic_long_add(v, &vm_numa_stat[i]); + } +#endif } #endif #ifdef CONFIG_NUMA +void __inc_numa_state(struct zone *zone, + enum numa_stat_item item) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + u16 __percpu *p = pcp->vm_numa_stat_diff + item; + u16 v; + + v = __this_cpu_inc_return(*p); + + if (unlikely(v > NUMA_STATS_THRESHOLD)) { + zone_numa_state_add(v, zone, item); + __this_cpu_write(*p, 0); + } +} + /* * Determine the per node value of a stat item. This function * is called frequently in a NUMA machine, so try to be as @@ -802,6 +898,23 @@ unsigned long sum_zone_node_page_state(int node, } /* + * Determine the per node value of a numa stat item. To avoid deviation, + * the per cpu stat number in vm_numa_stat_diff[] is also included. + */ +unsigned long sum_zone_numa_state(int node, + enum numa_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_numa_state_snapshot(zones + i, item); + + return count; +} + +/* * Determine the per node value of a stat item. */ unsigned long node_page_state(struct pglist_data *pgdat, @@ -937,6 +1050,9 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif + "nr_free_cma", + + /* enum numa_stat_item counters */ #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -945,7 +1061,6 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif - "nr_free_cma", /* Node-based counters */ "nr_inactive_anon", @@ -1106,7 +1221,6 @@ const char * const vmstat_text[] = { }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ - #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_PROC_FS) static void *frag_start(struct seq_file *m, loff_t *pos) @@ -1384,7 +1498,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + + NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); } } @@ -1421,6 +1536,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n %-12s %lu", vmstat_text[i], zone_page_state(zone, i)); +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + zone_numa_state_snapshot(zone, i)); +#endif + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1497,6 +1619,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); @@ -1512,6 +1635,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v[i] = global_zone_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + v[i] = global_numa_state(i); + v += NR_VM_NUMA_STAT_ITEMS; 
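The NUMA counters added here follow the same two-level scheme as the rest of vmstat: a small per-CPU u16 differential that is folded into the shared atomic once it crosses NUMA_STATS_THRESHOLD, with readers summing the unflushed residue back in (which is what zone_numa_state_snapshot() does). A rough userspace-style model of that scheme, not kernel code and with illustrative names:

	#include <stdint.h>

	#define NR_CPUS_MODEL		8
	#define NUMA_STATS_THRESHOLD	(UINT16_MAX - 2)

	struct numa_counter {
		long		global;			/* plays the role of vm_numa_stat[] */
		uint16_t	diff[NR_CPUS_MODEL];	/* plays the role of vm_numa_stat_diff[] */
	};

	/* Like __inc_numa_state(): bump the per-CPU diff, flush when the u16 is nearly full. */
	static void numa_counter_inc(struct numa_counter *c, int cpu)
	{
		uint16_t v = ++c->diff[cpu];

		if (v > NUMA_STATS_THRESHOLD) {
			c->global += v;
			c->diff[cpu] = 0;
		}
	}

	/* Like zone_numa_state_snapshot(): flushed value plus the unflushed per-CPU residue. */
	static long numa_counter_snapshot(const struct numa_counter *c)
	{
		long sum = c->global;
		int cpu;

		for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
			sum += c->diff[cpu];
		return sum;
	}

The u16 assumption is what the BUILD_BUG_ON() added to need_update() pins down.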
+#endif + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) v[i] = global_node_page_state(i); v += NR_VM_NODE_STAT_ITEMS; @@ -1613,6 +1742,16 @@ int vmstat_refresh(struct ctl_table *table, int write, err = -EINVAL; } } +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_numa_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); + err = -EINVAL; + } + } +#endif if (err) return err; if (write) @@ -1654,13 +1793,20 @@ static bool need_update(int cpu) struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); +#ifdef CONFIG_NUMA + BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); +#endif + /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. */ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) return true; - +#ifdef CONFIG_NUMA + if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) + return true; +#endif } return false; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 62457eb82330..7c38e850a8fc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -551,20 +551,23 @@ static int get_size_class_index(int size) return min_t(int, ZS_SIZE_CLASSES - 1, idx); } +/* type can be of enum type zs_stat_type or fullness_group */ static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) + int type, unsigned long cnt) { class->stats.objs[type] += cnt; } +/* type can be of enum type zs_stat_type or fullness_group */ static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) + int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } +/* type can be of enum type zs_stat_type or fullness_group */ static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) + int type) { return class->stats.objs[type]; } @@ -1969,6 +1972,14 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, unsigned int obj_idx; int ret = -EAGAIN; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the zs lock, which does not work with + * MIGRATE_SYNC_NO_COPY workflow. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); |
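Like the balloon driver earlier in this series, zsmalloc copies object data under its own lock, so it has to refuse the new MIGRATE_SYNC_NO_COPY mode rather than let the caller defer the copy. Any migratepage implementation with the same constraint would follow the same pattern; a hypothetical sketch (foo_page_migrate() is not a real callback, and the fallthrough to migrate_page() is only there to round out the example):

	#include <linux/fs.h>
	#include <linux/migrate.h>

	static int foo_page_migrate(struct address_space *mapping,
				    struct page *newpage, struct page *page,
				    enum migrate_mode mode)
	{
		/*
		 * The copy must happen under a driver-private lock, so the
		 * "caller does the copy later" mode cannot be supported.
		 */
		if (mode == MIGRATE_SYNC_NO_COPY)
			return -EINVAL;

		/* Driver-specific isolation, copy and remap elided in this sketch. */
		return migrate_page(mapping, newpage, page, mode);
	}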