Diffstat (limited to 'mm')
-rw-r--r--   mm/huge_memory.c       71
-rw-r--r--   mm/hugetlb.c            5
-rw-r--r--   mm/memblock.c           2
-rw-r--r--   mm/memory-failure.c     6
-rw-r--r--   mm/mempolicy.c         34
-rw-r--r--   mm/mmap.c              25
-rw-r--r--   mm/page_alloc.c        19
-rw-r--r--   mm/shmem.c              8
-rw-r--r--   mm/sparse.c            16
-rw-r--r--   mm/vmscan.c            22
10 files changed, 134 insertions, 74 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 622cced74fd9..e84a10b0d310 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,40 +629,30 @@ release:
* available
* never: never stall for any thp allocation
*/
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
- gfp_t this_node = 0;
-
-#ifdef CONFIG_NUMA
- struct mempolicy *pol;
- /*
- * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
- * specified, to express a general desire to stay on the current
- * node for optimistic allocation attempts. If the defrag mode
- * and/or madvise hint requires the direct reclaim then we prefer
- * to fallback to other node rather than node reclaim because that
- * can lead to excessive reclaim even though there is free memory
- * on other nodes. We expect that NUMA preferences are specified
- * by memory policies.
- */
- pol = get_vma_policy(vma, addr);
- if (pol->mode != MPOL_BIND)
- this_node = __GFP_THISNODE;
- mpol_cond_put(pol);
-#endif
+ /* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+ /* Kick kcompactd and fail quickly */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+ /* Synchronous compaction if madvised, otherwise kick kcompactd */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- __GFP_KSWAPD_RECLAIM | this_node);
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+
+ /* Only do synchronous compaction if madvised */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- this_node);
- return GFP_TRANSHUGE_LIGHT | this_node;
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+ return GFP_TRANSHUGE_LIGHT;
}
/* Caller must hold page table lock. */
@@ -734,8 +724,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
}
- gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
- page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+ gfp = alloc_hugepage_direct_gfpmask(vma);
+ page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1305,9 +1295,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
- new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
- haddr, numa_node_id());
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+ new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -2155,23 +2144,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
old_pmd = pmdp_invalidate(vma, haddr, pmd);
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
pmd_migration = is_pmd_migration_entry(old_pmd);
- if (pmd_migration) {
+ if (unlikely(pmd_migration)) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_to_page(swp_offset(entry));
- } else
-#endif
+ write = is_write_migration_entry(entry);
+ young = false;
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ } else {
page = pmd_page(old_pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+ write = pmd_write(old_pmd);
+ young = pmd_young(old_pmd);
+ soft_dirty = pmd_soft_dirty(old_pmd);
+ }
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
- if (pmd_dirty(old_pmd))
- SetPageDirty(page);
- write = pmd_write(old_pmd);
- young = pmd_young(old_pmd);
- soft_dirty = pmd_soft_dirty(old_pmd);
/*
* Withdraw the table only after we mark the pmd entry invalid.
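Note: the vma_madvised test in alloc_hugepage_direct_gfpmask() above reflects VM_HUGEPAGE, which userspace requests with madvise(MADV_HUGEPAGE). A minimal user-space sketch of how that flag ends up on a mapping (buffer size and layout are arbitrary, for illustration only):

#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MB, spans at least one PMD-sized range */
	void *buf;

	/* anonymous private mapping; THP is considered for such memory */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* sets VM_HUGEPAGE on the VMA, making vma_madvised above true */
	if (madvise(buf, len, MADV_HUGEPAGE))
		return 1;

	/* first write faults the region in; do_huge_pmd_anonymous_page() may install a THP */
	((char *)buf)[0] = 1;
	return 0;
}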
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 705a3e9cc910..a80832487981 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1248,10 +1248,11 @@ void free_huge_page(struct page *page)
(struct hugepage_subpool *)page_private(page);
bool restore_reserve;
- set_page_private(page, 0);
- page->mapping = NULL;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
diff --git a/mm/memblock.c b/mm/memblock.c
index 9a2d5ae81ae1..81ae63ca78d0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1727,7 +1727,7 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
-bool __init memblock_is_reserved(phys_addr_t addr)
+bool __init_memblock memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0cd3de3550f0..7c72f2a95785 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1161,6 +1161,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
LIST_HEAD(tokill);
int rc = -EBUSY;
loff_t start;
+ dax_entry_t cookie;
/*
* Prevent the inode from being freed while we are interrogating
@@ -1169,7 +1170,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* also prevents changes to the mapping of this pfn until
* poison signaling is complete.
*/
- if (!dax_lock_mapping_entry(page))
+ cookie = dax_lock_page(page);
+ if (!cookie)
goto out;
if (hwpoison_filter(page)) {
@@ -1220,7 +1222,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
unlock:
- dax_unlock_mapping_entry(page);
+ dax_unlock_page(page, cookie);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5837a067124d..d4496d9d34f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
} else if (PageTransHuge(page)) {
struct page *thp;
- thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
- address, numa_node_id());
+ thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+ HPAGE_PMD_ORDER);
if (!thp)
return NULL;
prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,6 +2011,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
* @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
@@ -2021,7 +2022,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node)
+ unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
@@ -2039,6 +2040,31 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
goto out;
}
+ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+ int hpage_node = node;
+
+ /*
+ * For hugepage allocation and non-interleave policy which
+ * allows the current node (or other explicitly preferred
+ * node) we only try to allocate from the current/preferred
+ * node and don't fall back to other nodes, as the cost of
+ * remote accesses would likely offset THP benefits.
+ *
+ * If the policy is interleave, or does not allow the current
+ * node in its nodemask, we allocate the standard way.
+ */
+ if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+ hpage_node = pol->v.preferred_node;
+
+ nmask = policy_nodemask(gfp, pol);
+ if (!nmask || node_isset(hpage_node, *nmask)) {
+ mpol_cond_put(pol);
+ page = __alloc_pages_node(hpage_node,
+ gfp | __GFP_THISNODE, order);
+ goto out;
+ }
+ }
+
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
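The alloc_hugepage_vma() calls introduced in huge_memory.c and in new_page() above presumably rely on a companion wrapper outside mm/ (include/linux/gfp.h is not part of this diffstat). A sketch of what that wrapper is expected to look like, given the new bool hugepage parameter of alloc_pages_vma():

/* Assumed companion definition, not shown in this diff (limited to 'mm'): */
#ifdef CONFIG_NUMA
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
	alloc_pages(gfp_mask, order)
#endif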
diff --git a/mm/mmap.c b/mm/mmap.c
index 6c04292e16a7..7bb64381e77c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2066,6 +2066,15 @@ found_highest:
return gap_end;
}
+
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr) (TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -2085,8 +2094,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -2095,7 +2105,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
@@ -2104,7 +2114,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = mmap_end;
info.align_mask = 0;
return vm_unmapped_area(&info);
}
@@ -2124,9 +2134,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
/* requested length too big for entire address space */
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -2136,7 +2147,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
@@ -2145,7 +2156,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
addr = vm_unmapped_area(&info);
@@ -2159,7 +2170,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
+ info.high_limit = mmap_end;
addr = vm_unmapped_area(&info);
}
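The two default macros added above only take effect when an architecture does not provide its own definitions. As a hypothetical illustration (macro values and window names invented here, not taken from any particular arch), an override that widens the search limits when the hint address lies above a default map window might look like:

/* Hypothetical arch header (e.g. asm/processor.h), illustrative only */
#define ARCH_DEFAULT_MAP_WINDOW	(1UL << 47)
#define ARCH_FULL_VA_LIMIT	(1UL << 52)

#define arch_get_mmap_end(addr) \
	(((addr) > ARCH_DEFAULT_MAP_WINDOW) ? ARCH_FULL_VA_LIMIT : \
					      ARCH_DEFAULT_MAP_WINDOW)

#define arch_get_mmap_base(addr, base) \
	(((addr) > ARCH_DEFAULT_MAP_WINDOW) ? \
		(base) + (ARCH_FULL_VA_LIMIT - ARCH_DEFAULT_MAP_WINDOW) : (base))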
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2ec9cc407216..e95b5b7c9c3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5542,6 +5542,18 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
cond_resched();
}
}
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * If the zone does not span the rest of the section then
+ * we should at least initialize those pages. Otherwise we
+ * could blow up on a poisoned page in some paths which depend
+ * on full sections being initialized (e.g. memory hotplug).
+ */
+ while (end_pfn % PAGES_PER_SECTION) {
+ __init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
+ end_pfn++;
+ }
+#endif
}
#ifdef CONFIG_ZONE_DEVICE
@@ -7802,11 +7814,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
+ struct page *head = compound_head(page);
+ unsigned int skip_pages;
- if (!hugepage_migration_supported(page_hstate(page)))
+ if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
- iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
+ skip_pages = (1 << compound_order(head)) - (page - head);
+ iter += skip_pages - 1;
continue;
}
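A quick worked example of the new skip_pages arithmetic in has_unmovable_pages() (pfn values are hypothetical): if the scan lands on a tail page 100 pages into a 512-page (order-9) hugepage, it should resume right after the compound page rather than being rounded relative to the tail page.

#include <stdio.h>

int main(void)
{
	unsigned long head_pfn = 0x80000;		/* hypothetical head pfn */
	unsigned long page_pfn = head_pfn + 100;	/* tail page hit by the scan */
	unsigned int order = 9;				/* compound_order(head) */
	unsigned int skip_pages = (1u << order) - (page_pfn - head_pfn);	/* 412 */

	/* iter += skip_pages - 1, then the loop's own iter++ steps past the hugepage */
	printf("skip %u pages, resume at pfn 0x%lx\n", skip_pages, page_pfn + skip_pages);
	return 0;
}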
diff --git a/mm/shmem.c b/mm/shmem.c
index cddc72ac44d8..375f3ac19bb8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -661,9 +661,7 @@ static int shmem_free_swap(struct address_space *mapping,
{
void *old;
- xa_lock_irq(&mapping->i_pages);
- old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
- xa_unlock_irq(&mapping->i_pages);
+ old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
if (old != radswap)
return -ENOENT;
free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -760,7 +758,7 @@ void shmem_unlock_mapping(struct address_space *mapping)
break;
index = indices[pvec.nr - 1] + 1;
pagevec_remove_exceptionals(&pvec);
- check_move_unevictable_pages(pvec.pages, pvec.nr);
+ check_move_unevictable_pages(&pvec);
pagevec_release(&pvec);
cond_resched();
}
@@ -1439,7 +1437,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
- HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
+ HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
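The xa_cmpxchg_irq() used above is expected to be the irq-disabling convenience wrapper from the XArray API, roughly equivalent to the open-coded sequence it replaces. A sketch of the assumed helper (see include/linux/xarray.h; not part of this diff):

static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
				   void *old, void *entry, gfp_t gfp)
{
	void *curr;

	xa_lock_irq(xa);
	curr = __xa_cmpxchg(xa, index, old, entry, gfp);
	xa_unlock_irq(xa);

	return curr;
}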
diff --git a/mm/sparse.c b/mm/sparse.c
index 33307fc05c4d..3abc8cc50201 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -240,6 +240,22 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
}
/*
+ * Mark all memblocks as present using memory_present(). This is a
+ * convenience function that is useful for a number of arches
+ * to mark all of the system's memory as present during initialization.
+ */
+void __init memblocks_present(void)
+{
+ struct memblock_region *reg;
+
+ for_each_memblock(memory, reg) {
+ memory_present(memblock_get_region_node(reg),
+ memblock_region_memory_base_pfn(reg),
+ memblock_region_memory_end_pfn(reg));
+ }
+}
+
+/*
* Subtle, we encode the real pfn into the mem_map such that
* the identity pfn - section_mem_map will return the actual
* physical page frame number.
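The new memblocks_present() helper lets architecture setup code drop its open-coded loop over memblock regions. A hedged sketch of the intended call-site simplification (the arch function name is illustrative):

/* Hypothetical arch init code, illustrative only */
void __init arch_mem_init(void)
{
	/* was: for_each_memblock(memory, reg) memory_present(nid, base, end); */
	memblocks_present();
	sparse_init();
}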
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62ac0c488624..24ab1f7394ab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,6 +46,7 @@
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
+#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
@@ -4182,17 +4183,16 @@ int page_evictable(struct page *page)
return ret;
}
-#ifdef CONFIG_SHMEM
/**
- * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
- * @pages: array of pages to check
- * @nr_pages: number of pages to check
+ * check_move_unevictable_pages - check pages for evictability and move to
+ * appropriate zone lru list
+ * @pvec: pagevec with lru pages to check
*
- * Checks pages for evictability and moves them to the appropriate lru list.
- *
- * This function is only used for SysV IPC SHM_UNLOCK.
+ * Checks pages for evictability; if an evictable page is in the unevictable
+ * lru list, it is moved to the appropriate evictable lru list. This function
+ * should only be used for lru pages.
*/
-void check_move_unevictable_pages(struct page **pages, int nr_pages)
+void check_move_unevictable_pages(struct pagevec *pvec)
{
struct lruvec *lruvec;
struct pglist_data *pgdat = NULL;
@@ -4200,8 +4200,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
int pgrescued = 0;
int i;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pages[i];
+ for (i = 0; i < pvec->nr; i++) {
+ struct page *page = pvec->pages[i];
struct pglist_data *pagepgdat = page_pgdat(page);
pgscanned++;
@@ -4233,4 +4233,4 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
spin_unlock_irq(&pgdat->lru_lock);
}
}
-#endif /* CONFIG_SHMEM */
+EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
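With the pagevec-based signature and the EXPORT_SYMBOL_GPL above, modular callers can batch lru pages the same way shmem_unlock_mapping() does. A minimal usage sketch, assuming the caller already holds references to lru pages (the helper name and page array are hypothetical):

#include <linux/pagevec.h>
#include <linux/swap.h>		/* check_move_unevictable_pages() */

static void example_check_unevictable(struct page **pages, int nr)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec);
	for (i = 0; i < nr; i++) {
		/* pagevec_add() returns the remaining space; 0 means the pagevec is full */
		if (!pagevec_add(&pvec, pages[i])) {
			check_move_unevictable_pages(&pvec);
			pagevec_reinit(&pvec);
		}
	}
	if (pagevec_count(&pvec))
		check_move_unevictable_pages(&pvec);
}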