summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/compaction.c2
-rw-r--r--mm/fadvise.c10
-rw-r--r--mm/filemap.c1
-rw-r--r--mm/hmm.c4
-rw-r--r--mm/huge_memory.c82
-rw-r--r--mm/hugetlb.c377
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/khugepaged.c15
-rw-r--r--mm/kmemleak.c1
-rw-r--r--mm/memcontrol.c271
-rw-r--r--mm/memory.c80
-rw-r--r--mm/memory_hotplug.c9
-rw-r--r--mm/mempolicy.c39
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mmu_notifier.c31
-rw-r--r--mm/mprotect.c5
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/oom_kill.c21
-rw-r--r--mm/page_alloc.c174
-rw-r--r--mm/page_ext.c2
-rw-r--r--mm/page_owner.c20
-rw-r--r--mm/pgtable-generic.c6
-rw-r--r--mm/shmem.c59
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slab.h3
-rw-r--r--mm/slab_common.c56
-rw-r--r--mm/slub.c12
-rw-r--r--mm/sparse.c6
-rw-r--r--mm/swap.c27
-rw-r--r--mm/truncate.c23
-rw-r--r--mm/vmscan.c86
-rw-r--r--mm/zpool.c25
-rw-r--r--mm/zsmalloc.c31
-rw-r--r--mm/zswap.c91
35 files changed, 844 insertions, 748 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 03ff7703d322..c782e8fb7235 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB
A sane initial value is 80 MB.
-# For architectures that support deferred memory initialisation
-config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
- bool
-
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
default n
- depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
- depends on NO_BOOTMEM && MEMORY_HOTPLUG
+ depends on NO_BOOTMEM
depends on !FLATMEM
help
Ordinarily all struct pages are initialised during early boot in a
diff --git a/mm/compaction.c b/mm/compaction.c
index 10cd757f1006..2c8999d027ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
- * @mode: The migration mode for async, sync light, or sync migration
+ * @prio: Determines how hard direct compaction should try to succeed
*
* This is the main entry point for direct page compaction.
*/
diff --git a/mm/fadvise.c b/mm/fadvise.c
index ec70d6e4b86d..767887f5f3bf 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
*/
start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
end_index = (endbyte >> PAGE_SHIFT);
- if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) {
+ /*
+ * The page at end_index will be inclusively discarded according
+ * by invalidate_mapping_pages(), so subtracting 1 from
+ * end_index means we will skip the last page. But if endbyte
+ * is page aligned or is at the end of file, we should not skip
+ * that page - discarding the last page is safe enough.
+ */
+ if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
+ endbyte != inode->i_size - 1) {
/* First page is tricky as 0 - 1 = -1, but pgoff_t
* is unsigned, so the end_index >= start_index
* check below would be true and we'll discard the whole
diff --git a/mm/filemap.c b/mm/filemap.c
index ee83baaf855d..693f62212a59 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -31,7 +31,6 @@
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
-#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
diff --git a/mm/hmm.c b/mm/hmm.c
index ea19742a5d60..979211c7ccc8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -418,7 +418,7 @@ again:
}
if (!pte_present(pte)) {
- swp_entry_t entry;
+ swp_entry_t entry = pte_to_swp_entry(pte);
if (!non_swap_entry(entry)) {
if (hmm_vma_walk->fault)
@@ -426,8 +426,6 @@ again:
continue;
}
- entry = pte_to_swp_entry(pte);
-
/*
* This is a special swap entry, ignore migration, use
* device and report anything else as error.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0e7ded98d114..87ab9b8f56b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmdp_invalidate() is required to make sure we don't miss
* dirty/young flags set by hardware.
*/
- entry = *pmd;
- pmdp_invalidate(vma, addr, pmd);
-
- /*
- * Recover dirty/young flags. It relies on pmdp_invalidate to not
- * corrupt them.
- */
- if (pmd_dirty(*pmd))
- entry = pmd_mkdirty(entry);
- if (pmd_young(*pmd))
- entry = pmd_mkyoung(entry);
+ entry = pmdp_invalidate(vma, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
@@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct mm_struct *mm = vma->vm_mm;
struct page *page;
pgtable_t pgtable;
- pmd_t _pmd;
- bool young, write, dirty, soft_dirty, pmd_migration = false;
+ pmd_t old_pmd, _pmd;
+ bool young, write, soft_dirty, pmd_migration = false;
unsigned long addr;
int i;
@@ -2116,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
+ /*
+ * Up to this point the pmd is present and huge and userland has the
+ * whole access to the hugepage during the split (which happens in
+ * place). If we overwrite the pmd with the not-huge version pointing
+ * to the pte here (which of course we could if all CPUs were bug
+ * free), userland could trigger a small page size TLB miss on the
+ * small sized TLB while the hugepage TLB entry is still established in
+ * the huge TLB. Some CPU doesn't like that.
+ * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+ * 383 on page 93. Intel should be safe but is also warns that it's
+ * only safe if the permission and cache attributes of the two entries
+ * loaded in the two TLB is identical (which should be the case here).
+ * But it is generally safer to never allow small and huge TLB entries
+ * for the same virtual address to be loaded simultaneously. So instead
+ * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+ * current pmd notpresent (atomically because here the pmd_trans_huge
+ * must remain set at all times on the pmd until the split is complete
+ * for this pmd), then we flush the SMP TLB and finally we write the
+ * non-huge version of the pmd entry with pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- pmd_migration = is_pmd_migration_entry(*pmd);
+ pmd_migration = is_pmd_migration_entry(old_pmd);
if (pmd_migration) {
swp_entry_t entry;
- entry = pmd_to_swp_entry(*pmd);
+ entry = pmd_to_swp_entry(old_pmd);
page = pfn_to_page(swp_offset(entry));
} else
#endif
- page = pmd_page(*pmd);
+ page = pmd_page(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
- write = pmd_write(*pmd);
- young = pmd_young(*pmd);
- dirty = pmd_dirty(*pmd);
- soft_dirty = pmd_soft_dirty(*pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+ write = pmd_write(old_pmd);
+ young = pmd_young(old_pmd);
+ soft_dirty = pmd_soft_dirty(old_pmd);
- pmdp_huge_split_prepare(vma, haddr, pmd);
+ /*
+ * Withdraw the table only after we mark the pmd entry invalid.
+ * This's critical for some architectures (Power).
+ */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2160,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
}
- if (dirty)
- SetPageDirty(page + i);
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
@@ -2189,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * and pmd_trans_splitting must remain set at all times on the pmd
- * until the split is complete for this pmd), then we flush the SMP TLB
- * and finally we write the non-huge version of the pmd entry with
- * pmd_populate.
- */
- pmdp_invalidate(vma, haddr, pmd);
pmd_populate(mm, pmd, pgtable);
if (freeze) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9a334f5fb730..7c204e3d132b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,10 +34,9 @@
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
#include "internal.h"
-int hugepages_treat_as_movable;
-
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
@@ -926,7 +925,7 @@ retry_cpuset:
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepages_treat_as_movable || hugepage_migration_supported(h))
+ if (hugepage_migration_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}
-static struct page *alloc_gigantic_page(int nid, struct hstate *h)
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
unsigned int order = huge_page_order(h);
unsigned long nr_pages = 1 << order;
@@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- gfp_t gfp_mask;
- gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
zonelist = node_zonelist(nid, gfp_mask);
- for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
spin_lock_irqsave(&zone->lock, flags);
pfn = ALIGN(zone->zone_start_pfn, nr_pages);
@@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
- struct page *page;
-
- page = alloc_gigantic_page(nid, h);
- if (page) {
- prep_compound_gigantic_page(page, huge_page_order(h));
- prep_new_huge_page(h, page, nid);
- }
-
- return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- struct page *page = NULL;
- int nr_nodes, node;
-
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_gigantic_page_node(h, node);
- if (page)
- return 1;
- }
-
- return 0;
-}
-
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static inline bool gigantic_page_supported(void) { return false; }
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask) { return NULL; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed) { return 0; }
#endif
static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
ClearPagePrivate(&page[1]);
}
+/*
+ * Internal hugetlb specific page flag. Do not use outside of the hugetlb
+ * code
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+ if (!PageHuge(page))
+ return false;
+
+ return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = NULL;
+}
+
void free_huge_page(struct page *page)
{
/*
@@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
- if (h->surplus_huge_pages_node[nid]) {
+ if (PageHugeTemporary(page)) {
+ list_del(&page->lru);
+ ClearPageHugeTemporary(page);
+ update_and_free_page(h, page);
+ } else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
}
static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page)
return (index << compound_order(page_head)) + compound_idx;
}
-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
+ int order = huge_page_order(h);
struct page *page;
- page = __alloc_pages_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
- huge_page_order(h));
- if (page) {
- prep_new_huge_page(h, page, nid);
- }
+ gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+ if (page)
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ return page;
+}
+
+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ else
+ page = alloc_buddy_huge_page(h, gfp_mask,
+ nid, nmask);
+ if (!page)
+ return NULL;
+
+ if (hstate_is_gigantic(h))
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, page_to_nid(page));
return page;
}
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+/*
+ * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
+ * manner.
+ */
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
int nr_nodes, node;
- int ret = 0;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page_node(h, node);
- if (page) {
- ret = 1;
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+ if (page)
break;
- }
}
- if (ret)
- count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ if (!page)
+ return 0;
- return ret;
+ put_page(page); /* free it into the hugepage allocator */
+
+ return 1;
}
/*
@@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
return rc;
}
-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
-{
- int order = huge_page_order(h);
-
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
- if (nid == NUMA_NO_NODE)
- nid = numa_mem_id();
- return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
-}
-
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page;
- unsigned int r_nid;
+ struct page *page = NULL;
if (hstate_is_gigantic(h))
return NULL;
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+ goto out_unlock;
+ spin_unlock(&hugetlb_lock);
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ if (!page)
+ return NULL;
+
+ spin_lock(&hugetlb_lock);
/*
- * Assume we will successfully allocate the surplus page to
- * prevent racing processes from causing the surplus to exceed
- * overcommit
- *
- * This however introduces a different race, where a process B
- * tries to grow the static hugepage pool while alloc_pages() is
- * called by process A. B will only examine the per-node
- * counters in determining if surplus huge pages can be
- * converted to normal huge pages in adjust_pool_surplus(). A
- * won't be able to increment the per-node counter, until the
- * lock is dropped by B, but B doesn't drop hugetlb_lock until
- * no more huge pages can be converted from surplus to normal
- * state (and doesn't try to convert again). Thus, we have a
- * case where a surplus huge page exists, the pool is grown, and
- * the surplus huge page still exists after, even though it
- * should just have been converted to a normal huge page. This
- * does not leak memory, though, as the hugepage will be freed
- * once it is out of use. It also does not allow the counters to
- * go out of whack in adjust_pool_surplus() as we don't modify
- * the node values until we've gotten the hugepage and only the
- * per-node value is checked there.
+ * We could have raced with the pool size change.
+ * Double check that and simply deallocate the new page
+ * if we would end up overcommiting the surpluses. Abuse
+ * temporary page to workaround the nasty free_huge_page
+ * codeflow
*/
- spin_lock(&hugetlb_lock);
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- spin_unlock(&hugetlb_lock);
- return NULL;
+ SetPageHugeTemporary(page);
+ put_page(page);
+ page = NULL;
} else {
- h->nr_huge_pages++;
h->surplus_huge_pages++;
+ h->nr_huge_pages_node[page_to_nid(page)]++;
}
+
+out_unlock:
spin_unlock(&hugetlb_lock);
- page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+ return page;
+}
- spin_lock(&hugetlb_lock);
- if (page) {
- INIT_LIST_HEAD(&page->lru);
- r_nid = page_to_nid(page);
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- set_hugetlb_cgroup(page, NULL);
- /*
- * We incremented the global counters already
- */
- h->nr_huge_pages_node[r_nid]++;
- h->surplus_huge_pages_node[r_nid]++;
- __count_vm_event(HTLB_BUDDY_PGALLOC);
- } else {
- h->nr_huge_pages--;
- h->surplus_huge_pages--;
- __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
- }
- spin_unlock(&hugetlb_lock);
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ return NULL;
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ if (!page)
+ return NULL;
+
+ /*
+ * We do not account these pages as surplus because they are only
+ * temporary and will be released properly on the last reference
+ */
+ SetPageHugeTemporary(page);
return page;
}
@@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
@@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
- page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
return page;
}
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);
if (!page)
- page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+ page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
return page;
}
-
+/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask)
{
@@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock(&hugetlb_lock);
- /* No reservations, try to overcommit */
+ return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct page *page;
+ gfp_t gfp_mask;
+ int node;
+
+ gfp_mask = htlb_alloc_mask(h);
+ node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask);
+ mpol_cond_put(mpol);
- return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+ return page;
}
/*
@@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+ page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
@@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
- page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+ page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2074,20 +2097,6 @@ out_subpool_put:
return ERR_PTR(-ENOSPC);
}
-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
-{
- struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
- if (IS_ERR(page))
- page = NULL;
- return page;
-}
-
int alloc_bootmem_huge_page(struct hstate *h)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h)
@@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void)
prep_compound_huge_page(page, h->order);
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* free it into the hugepage allocator */
+
/*
* If we had gigantic hugepages allocated at boot time, we need
* to restore the 'stolen' pages to totalram_pages in order to
@@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
- } else if (!alloc_fresh_huge_page(h,
+ } else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY]))
break;
cond_resched();
@@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with __alloc_buddy_huge_page() here and be unable
+ * We might race with alloc_surplus_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
/* yield cpu to avoid soft lockup */
cond_resched();
- if (hstate_is_gigantic(h))
- ret = alloc_fresh_gigantic_page(h, nodes_allowed);
- else
- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * __alloc_buddy_huge_page() is checking the global counter,
+ * alloc_surplus_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -2975,20 +2983,32 @@ out:
void hugetlb_report_meminfo(struct seq_file *m)
{
- struct hstate *h = &default_hstate;
+ struct hstate *h;
+ unsigned long total = 0;
+
if (!hugepages_supported())
return;
- seq_printf(m,
- "HugePages_Total: %5lu\n"
- "HugePages_Free: %5lu\n"
- "HugePages_Rsvd: %5lu\n"
- "HugePages_Surp: %5lu\n"
- "Hugepagesize: %8lu kB\n",
- h->nr_huge_pages,
- h->free_huge_pages,
- h->resv_huge_pages,
- h->surplus_huge_pages,
- 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+
+ for_each_hstate(h) {
+ unsigned long count = h->nr_huge_pages;
+
+ total += (PAGE_SIZE << huge_page_order(h)) * count;
+
+ if (h == &default_hstate)
+ seq_printf(m,
+ "HugePages_Total: %5lu\n"
+ "HugePages_Free: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
+ "HugePages_Surp: %5lu\n"
+ "Hugepagesize: %8lu kB\n",
+ count,
+ h->free_huge_pages,
+ h->resv_huge_pages,
+ h->surplus_huge_pages,
+ (PAGE_SIZE << huge_page_order(h)) / 1024);
+ }
+
+ seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
}
int hugetlb_report_node_meminfo(int nid, char *buf)
@@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page)
spin_unlock(&hugetlb_lock);
put_page(page);
}
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+ struct hstate *h = page_hstate(oldpage);
+
+ hugetlb_cgroup_migrate(oldpage, newpage);
+ set_page_owner_migrate_reason(newpage, reason);
+
+ /*
+ * transfer temporary state of the new huge page. This is
+ * reverse to other transitions because the newpage is going to
+ * be final while the old one will be freed so it takes over
+ * the temporary status.
+ *
+ * Also note that we have to transfer the per-node surplus state
+ * here as well otherwise the global surplus count will not match
+ * the per-node's.
+ */
+ if (PageHugeTemporary(newpage)) {
+ int old_nid = page_to_nid(oldpage);
+ int new_nid = page_to_nid(newpage);
+
+ SetPageHugeTemporary(oldpage);
+ ClearPageHugeTemporary(newpage);
+
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages_node[old_nid]) {
+ h->surplus_huge_pages_node[old_nid]--;
+ h->surplus_huge_pages_node[new_nid]++;
+ }
+ spin_unlock(&hugetlb_lock);
+ }
+}
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index b47664358796..27ddfd29112a 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
- return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+ return v->vm_pgoff + vma_pages(v) - 1;
}
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ea4ff259b671..b7e2268dfc9a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm,
}
if (page_mapped(page))
- unmap_mapping_range(mapping, index << PAGE_SHIFT,
- PAGE_SIZE, 0);
+ unmap_mapping_pages(mapping, index, 1, false);
spin_lock_irq(&mapping->tree_lock);
@@ -1674,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
spin_unlock(&khugepaged_mm_lock);
mm = mm_slot->mm;
- down_read(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
- vma = NULL;
- else
+ /*
+ * Don't wait for semaphore (to avoid long wait times). Just move to
+ * the next mm on the list.
+ */
+ vma = NULL;
+ if (unlikely(!down_read_trylock(&mm->mmap_sem)))
+ goto breakouterloop_mmap_sem;
+ if (likely(!khugepaged_test_exit(mm)))
vma = find_vma(mm, khugepaged_scan.address);
progress++;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f656ca27f6c2..e83987c55a08 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -91,7 +91,6 @@
#include <linux/stacktrace.h>
#include <linux/cache.h>
#include <linux/percpu.h>
-#include <linux/hardirq.h>
#include <linux/bootmem.h>
#include <linux/pfn.h>
#include <linux/mmzone.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9011997d8a5c..0ae2dc3a1748 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
-/*
- * Return page count for single (non recursive) @memcg.
- *
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronization of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threshold and synchronization as vmstat[] should be
- * implemented.
- *
- * The parameter idx can be of type enum memcg_event_item or vm_event_item.
- */
-
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
int event)
{
- unsigned long val = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- val += per_cpu(memcg->stat->events[event], cpu);
- return val;
+ return atomic_long_read(&memcg->events[event]);
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -586,27 +557,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
* counted as CACHE even if it's on ANON LRU.
*/
if (PageAnon(page))
- __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
+ __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
else {
- __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
+ __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
if (PageSwapBacked(page))
- __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
+ __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
}
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
+ __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
}
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __this_cpu_inc(memcg->stat->events[PGPGIN]);
+ __count_memcg_events(memcg, PGPGIN, 1);
else {
- __this_cpu_inc(memcg->stat->events[PGPGOUT]);
+ __count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+ __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
@@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat->nr_page_events);
- next = __this_cpu_read(memcg->stat->targets[target]);
+ val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+ next = __this_cpu_read(memcg->stat_cpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
@@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
default:
break;
}
- __this_cpu_write(memcg->stat->targets[target], next);
+ __this_cpu_write(memcg->stat_cpu->targets[target], next);
return true;
}
return false;
@@ -1124,7 +1095,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
-unsigned int memcg1_stats[] = {
+static const unsigned int memcg1_stats[] = {
MEMCG_CACHE,
MEMCG_RSS,
MEMCG_RSS_HUGE,
@@ -1206,20 +1177,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
}
/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
- int num = 0;
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- num++;
- return num;
-}
-
-/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
@@ -1707,11 +1664,6 @@ void unlock_page_memcg(struct page *page)
}
EXPORT_SYMBOL(unlock_page_memcg);
-/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
- */
-#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1739,7 +1691,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
unsigned long flags;
bool ret = false;
- if (nr_pages > CHARGE_BATCH)
+ if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
local_irq_save(flags);
@@ -1808,7 +1760,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
}
stock->nr_pages += nr_pages;
- if (stock->nr_pages > CHARGE_BATCH)
+ if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
local_irq_restore(flags);
@@ -1858,9 +1810,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct mem_cgroup *memcg;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
+
+ for_each_mem_cgroup(memcg) {
+ int i;
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ int nid;
+ long x;
+
+ x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+ if (x)
+ atomic_long_add(x, &memcg->stat[i]);
+
+ if (i >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn;
+
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+ if (x)
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ }
+ }
+
+ for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+ long x;
+
+ x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+ if (x)
+ atomic_long_add(x, &memcg->events[i]);
+ }
+ }
+
return 0;
}
@@ -1881,7 +1868,7 @@ static void high_work_func(struct work_struct *work)
struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, high_work);
- reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+ reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}
/*
@@ -1905,7 +1892,7 @@ void mem_cgroup_handle_over_high(void)
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
- unsigned int batch = max(CHARGE_BATCH, nr_pages);
+ unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
@@ -2415,18 +2402,11 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++)
head[i].mem_cgroup = head->mem_cgroup;
- __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE],
- HPAGE_PMD_NR);
+ __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_MEMCG_SWAP
-static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
- int nr_entries)
-{
- this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);
-}
-
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
@@ -2450,8 +2430,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- mem_cgroup_swap_statistics(from, -1);
- mem_cgroup_swap_statistics(to, 1);
+ mod_memcg_state(from, MEMCG_SWAP, -1);
+ mod_memcg_state(to, MEMCG_SWAP, 1);
return 0;
}
return -EINVAL;
@@ -2467,23 +2447,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+ unsigned long limit, bool memsw)
{
- unsigned long curusage;
- unsigned long oldusage;
bool enlarge = false;
- int retry_count;
int ret;
-
- /*
- * For keeping hierarchical_reclaim simple, how long we should retry
- * is depends on callers. We set our retry-count to be function
- * of # of children which we should visit in this loop.
- */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memory);
+ bool limits_invariant;
+ struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
do {
if (signal_pending(current)) {
@@ -2492,79 +2461,31 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
}
mutex_lock(&memcg_limit_mutex);
- if (limit > memcg->memsw.limit) {
+ /*
+ * Make sure that the new limit (memsw or memory limit) doesn't
+ * break our basic invariant rule memory.limit <= memsw.limit.
+ */
+ limits_invariant = memsw ? limit >= memcg->memory.limit :
+ limit <= memcg->memsw.limit;
+ if (!limits_invariant) {
mutex_unlock(&memcg_limit_mutex);
ret = -EINVAL;
break;
}
- if (limit > memcg->memory.limit)
+ if (limit > counter->limit)
enlarge = true;
- ret = page_counter_limit(&memcg->memory, limit);
+ ret = page_counter_limit(counter, limit);
mutex_unlock(&memcg_limit_mutex);
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
-
- curusage = page_counter_read(&memcg->memory);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
-
- if (!ret && enlarge)
- memcg_oom_recover(memcg);
-
- return ret;
-}
-
-static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
- unsigned long limit)
-{
- unsigned long curusage;
- unsigned long oldusage;
- bool enlarge = false;
- int retry_count;
- int ret;
-
- /* see mem_cgroup_resize_res_limit */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memsw);
-
- do {
- if (signal_pending(current)) {
- ret = -EINTR;
+ if (!try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_KERNEL, !memsw)) {
+ ret = -EBUSY;
break;
}
-
- mutex_lock(&memcg_limit_mutex);
- if (limit < memcg->memory.limit) {
- mutex_unlock(&memcg_limit_mutex);
- ret = -EINVAL;
- break;
- }
- if (limit > memcg->memsw.limit)
- enlarge = true;
- ret = page_counter_limit(&memcg->memsw, limit);
- mutex_unlock(&memcg_limit_mutex);
-
- if (!ret)
- break;
-
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
-
- curusage = page_counter_read(&memcg->memsw);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
@@ -3020,10 +2941,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
break;
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
@@ -4168,8 +4089,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
- if (!pn->lruvec_stat) {
+ pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+ if (!pn->lruvec_stat_cpu) {
kfree(pn);
return 1;
}
@@ -4187,7 +4108,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- free_percpu(pn->lruvec_stat);
+ free_percpu(pn->lruvec_stat_cpu);
kfree(pn);
}
@@ -4197,7 +4118,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->stat);
+ free_percpu(memcg->stat_cpu);
kfree(memcg);
}
@@ -4226,8 +4147,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (memcg->id.id < 0)
goto fail;
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
+ memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat_cpu)
goto fail;
for_each_node(node)
@@ -4584,8 +4505,8 @@ static int mem_cgroup_move_account(struct page *page,
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
- __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
+ __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
+ __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -4597,16 +4518,14 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
- nr_pages);
- __this_cpu_add(to->stat->count[NR_FILE_DIRTY],
- nr_pages);
+ __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
+ __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
- __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
+ __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
+ __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
}
/*
@@ -5642,12 +5561,12 @@ static void uncharge_batch(const struct uncharge_gather *ug)
}
local_irq_save(flags);
- __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
- __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
- __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
- __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
- __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
- __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+ __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
+ __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
+ __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
+ __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
+ __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
+ __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
@@ -5874,7 +5793,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
if (in_softirq())
gfp_mask = GFP_NOWAIT;
- this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+ mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
if (try_charge(memcg, gfp_mask, nr_pages) == 0)
return true;
@@ -5895,7 +5814,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
return;
}
- this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+ mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
refill_stock(memcg, nr_pages);
}
@@ -6019,7 +5938,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
nr_entries);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(swap_memcg, nr_entries);
+ mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
page->mem_cgroup = NULL;
@@ -6085,7 +6004,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, nr_pages);
+ mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
return 0;
}
@@ -6113,7 +6032,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
else
page_counter_uncharge(&memcg->memsw, nr_pages);
}
- mem_cgroup_swap_statistics(memcg, -nr_pages);
+ mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
mem_cgroup_id_put_many(memcg, nr_pages);
}
rcu_read_unlock();
diff --git a/mm/memory.c b/mm/memory.c
index 793004608332..53373b7a1512 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-/* tlb_gather_mmu
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @fullmm argument is used when @mm is without
- * users and we're going to destroy the full address space (exit/execve).
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @start and @end are set to 0 and -1
+ * respectively when @mm is without users and we're going to destroy
+ * the full address space (exit/execve).
*/
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -2792,8 +2799,37 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
+ * unmap_mapping_pages() - Unmap pages from processes.
+ * @mapping: The address space containing pages to be unmapped.
+ * @start: Index of first page to be unmapped.
+ * @nr: Number of pages to be unmapped. 0 to unmap to end of file.
+ * @even_cows: Whether to unmap even private COWed pages.
+ *
+ * Unmap the pages in this address space from any userspace process which
+ * has them mmaped. Generally, you want to remove COWed pages as well when
+ * a file is being truncated, but not when invalidating pages from the page
+ * cache.
+ */
+void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
+ pgoff_t nr, bool even_cows)
+{
+ struct zap_details details = { };
+
+ details.check_mapping = even_cows ? NULL : mapping;
+ details.first_index = start;
+ details.last_index = start + nr - 1;
+ if (details.last_index < details.first_index)
+ details.last_index = ULONG_MAX;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
* unmap_mapping_range - unmap the portion of all mmaps in the specified
- * address_space corresponding to the specified page range in the underlying
+ * address_space corresponding to the specified byte range in the underlying
* file.
*
* @mapping: the address space containing mmaps to be unmapped.
@@ -2811,7 +2847,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- struct zap_details details = { };
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -2823,16 +2858,7 @@ void unmap_mapping_range(struct address_space *mapping,
hlen = ULONG_MAX - hba + 1;
}
- details.check_mapping = even_cows ? NULL : mapping;
- details.first_index = hba;
- details.last_index = hba + hlen - 1;
- if (details.last_index < details.first_index)
- details.last_index = ULONG_MAX;
-
- i_mmap_lock_write(mapping);
- if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
- unmap_mapping_range_tree(&mapping->i_mmap, &details);
- i_mmap_unlock_write(mapping);
+ unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -3485,9 +3511,8 @@ static int fault_around_bytes_get(void *data, u64 *val)
}
/*
- * fault_around_pages() and fault_around_mask() expects fault_around_bytes
- * rounded down to nearest page order. It's what do_fault_around() expects to
- * see.
+ * fault_around_bytes must be rounded down to the nearest page order as it's
+ * what do_fault_around() expects to see.
*/
static int fault_around_bytes_set(void *data, u64 val)
{
@@ -3530,13 +3555,14 @@ late_initcall(fault_around_debugfs);
* This function doesn't cross the VMA boundaries, in order to call map_pages()
* only once.
*
- * fault_around_pages() defines how many pages we'll try to map.
- * do_fault_around() expects it to return a power of two less than or equal to
- * PTRS_PER_PTE.
+ * fault_around_bytes defines how many bytes we'll try to map.
+ * do_fault_around() expects it to be set to a power of two less than or equal
+ * to PTRS_PER_PTE.
*
- * The virtual address of the area that we map is naturally aligned to the
- * fault_around_pages() value (and therefore to page order). This way it's
- * easier to guarantee that we don't cross page table boundaries.
+ * The virtual address of the area that we map is naturally aligned to
+ * fault_around_bytes rounded down to the machine page size
+ * (and therefore to page order). This way it's easier to guarantee
+ * that we don't cross page table boundaries.
*/
static int do_fault_around(struct vm_fault *vmf)
{
@@ -3553,8 +3579,8 @@ static int do_fault_around(struct vm_fault *vmf)
start_pgoff -= off;
/*
- * end_pgoff is either end of page table or end of vma
- * or fault_around_pages() from start_pgoff, depending what is nearest.
+ * end_pgoff is either the end of the page table, the end of
+ * the vma or nr_pages from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c52aa05b106c..9bbd6982d4e4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, SECTION_INFO);
- usemap = __nr_to_section(section_nr)->pageblock_flags;
+ usemap = ms->pageblock_flags;
page = virt_to_page(usemap);
mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
struct mem_section *ms;
struct page *page, *memmap;
- if (!pfn_valid(start_pfn))
- return;
-
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -210,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
- usemap = __nr_to_section(section_nr)->pageblock_flags;
+ usemap = ms->pageblock_flags;
page = virt_to_page(usemap);
mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -1637,7 +1634,7 @@ repeat:
goto failed_removal;
cond_resched();
- lru_add_drain_all_cpuslocked();
+ lru_add_drain_all();
drain_all_pages(zone);
pfn = scan_movable_pages(start_pfn, end_pfn);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ce44d3ff03d..d879f1d8a44a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1121,8 +1121,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
}
if (PageHuge(page)) {
- BUG_ON(!vma);
- return alloc_huge_page_noerr(vma, address, 1);
+ return alloc_huge_page_vma(page_hstate(compound_head(page)),
+ vma, address);
} else if (thp_migration_supported() && PageTransHuge(page)) {
struct page *thp;
@@ -1263,6 +1263,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
+ unsigned long t;
unsigned long nlongs;
unsigned long endmask;
@@ -1279,13 +1280,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
- /* When the user specified more nodes than supported just check
- if the non supported part is all zero. */
+ /*
+ * When the user specified more nodes than supported just check
+ * if the non supported part is all zero.
+ *
+ * If maxnode have more longs than MAX_NUMNODES, check
+ * the bits in that area first. And then go through to
+ * check the rest bits which equal or bigger than MAX_NUMNODES.
+ * Otherwise, just check bits [MAX_NUMNODES, maxnode).
+ */
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
- if (nlongs > PAGE_SIZE/sizeof(long))
- return -EINVAL;
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- unsigned long t;
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
@@ -1298,6 +1303,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
endmask = ~0UL;
}
+ if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
+ unsigned long valid_mask = endmask;
+
+ valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
+ if (get_user(t, nmask + nlongs - 1))
+ return -EFAULT;
+ if (t & valid_mask)
+ return -EINVAL;
+ }
+
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
nodes_addr(*nodes)[nlongs-1] &= endmask;
@@ -1418,10 +1433,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out_put;
}
- if (!nodes_subset(*new, node_states[N_MEMORY])) {
- err = -EINVAL;
+ task_nodes = cpuset_mems_allowed(current);
+ nodes_and(*new, *new, task_nodes);
+ if (nodes_empty(*new))
+ goto out_put;
+
+ nodes_and(*new, *new, node_states[N_MEMORY]);
+ if (nodes_empty(*new))
goto out_put;
- }
err = security_task_movememory(task);
if (err)
diff --git a/mm/migrate.c b/mm/migrate.c
index 4d0be47a322a..1e5525a25691 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1323,9 +1323,8 @@ put_anon:
put_anon_vma(anon_vma);
if (rc == MIGRATEPAGE_SUCCESS) {
- hugetlb_cgroup_migrate(hpage, new_hpage);
+ move_hugetlb_state(hpage, new_hpage, reason);
put_new_page = NULL;
- set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 96edb33fd09a..eff6b88a993f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ int id;
+ bool ret = false;
+
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
+ if (!mm_has_notifiers(mm))
+ return ret;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (!mn->ops->invalidate_range &&
+ !mn->ops->invalidate_range_start &&
+ !mn->ops->invalidate_range_end)
+ continue;
+
+ if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+ ret = true;
+ break;
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+ return ret;
+}
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 58b629bb70de..e3309fcf586b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!page || PageKsm(page))
continue;
+ /* Also skip shared copy-on-write pages */
+ if (is_cow_mapping(vma->vm_flags) &&
+ page_mapcount(page) != 1)
+ continue;
+
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index 17c00d93de2e..4b9864b17cb0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-void unmap_mapping_range(struct address_space *mapping,
- loff_t const holebegin, loff_t const holelen,
- int even_cows)
-{
-}
-EXPORT_SYMBOL(unmap_mapping_range);
-
int filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 29f855551efe..f2e7dfb81eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * If the mm has notifiers then we would need to invalidate them around
- * unmap_page_range and that is risky because notifiers can sleep and
- * what they do is basically undeterministic. So let's have a short
+ * If the mm has invalidate_{start,end}() notifiers that could block,
* sleep to give the oom victim some more time.
* TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time and add
- * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+ * notifiers cannot block for unbounded amount of time
*/
- if (mm_has_notifiers(mm)) {
+ if (mm_has_blockable_invalidate_notifiers(mm)) {
up_read(&mm->mmap_sem);
schedule_timeout_idle(HZ);
goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
- unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- NULL);
- tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
}
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 76c9688b6a0a..c7dd9c86e353 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
- * Determine how many pages need to be initialized durig early boot
+ * Determine how many pages need to be initialized during early boot
* (non-deferred initialization).
* The value of first_deferred_pfn will be set later, once non-deferred pages
* are initialized, but for now set it ULONG_MAX.
@@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
- /* Always populate low zones for address-contrained allocations */
+ /* Always populate low zones for address-constrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
(*nr_initialised)++;
@@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *zone,
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
+ unsigned long zone, int nid, bool zero)
{
- mm_zero_struct_page(page);
+ if (zero)
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1194,9 +1195,9 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
}
static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
- int nid)
+ int nid, bool zero)
{
- return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+ return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
break;
}
- __init_single_pfn(pfn, zid, nid);
+ __init_single_pfn(pfn, zid, nid, true);
}
#else
static inline void init_reserved_page(unsigned long pfn)
@@ -1457,92 +1458,87 @@ static inline void __init pgdat_init_report_one_done(void)
}
/*
- * Helper for deferred_init_range, free the given range, reset the counters, and
- * return number of pages freed.
+ * Returns true if page needs to be initialized or freed to buddy allocator.
+ *
+ * First we check if pfn is valid on architectures where it is possible to have
+ * holes within pageblock_nr_pages. On systems where it is not possible, this
+ * function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the validity
+ * of the head pfn.
+ *
+ * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
+ * within a node: a pfn is between start and end of a node, but does not belong
+ * to this memory node.
*/
-static inline unsigned long __init __def_free(unsigned long *nr_free,
- unsigned long *free_base_pfn,
- struct page **page)
+static inline bool __init
+deferred_pfn_valid(int nid, unsigned long pfn,
+ struct mminit_pfnnid_cache *nid_init_state)
{
- unsigned long nr = *nr_free;
+ if (!pfn_valid_within(pfn))
+ return false;
+ if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
+ return false;
+ if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
+ return false;
+ return true;
+}
- deferred_free_range(*free_base_pfn, nr);
- *free_base_pfn = 0;
- *nr_free = 0;
- *page = NULL;
+/*
+ * Free pages to buddy allocator. Try to free aligned pages in
+ * pageblock_nr_pages sizes.
+ */
+static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ unsigned long nr_free = 0;
- return nr;
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 0;
+ } else if (!(pfn & nr_pgmask)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 1;
+ cond_resched();
+ } else {
+ nr_free++;
+ }
+ }
+ /* Free the last block of pages to allocator */
+ deferred_free_range(pfn - nr_free, nr_free);
}
-static unsigned long __init deferred_init_range(int nid, int zid,
- unsigned long start_pfn,
- unsigned long end_pfn)
+/*
+ * Initialize struct pages. We minimize pfn page lookups and scheduler checks
+ * by performing it only once every pageblock_nr_pages.
+ * Return number of pages initialized.
+ */
+static unsigned long __init deferred_init_pages(int nid, int zid,
+ unsigned long pfn,
+ unsigned long end_pfn)
{
struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
- unsigned long free_base_pfn = 0;
unsigned long nr_pages = 0;
- unsigned long nr_free = 0;
struct page *page = NULL;
- unsigned long pfn;
- /*
- * First we check if pfn is valid on architectures where it is possible
- * to have holes within pageblock_nr_pages. On systems where it is not
- * possible, this function is optimized out.
- *
- * Then, we check if a current large page is valid by only checking the
- * validity of the head pfn.
- *
- * meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not
- * belong to this memory node.
- *
- * Finally, we minimize pfn page lookups and scheduler checks by
- * performing it only once every pageblock_nr_pages.
- *
- * We do it in two loops: first we initialize struct page, than free to
- * buddy allocator, becuse while we are freeing pages we can access
- * pages that are ahead (computing buddy page in __free_one_page()).
- */
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ page = NULL;
continue;
- if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
- if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
- if (page && (pfn & nr_pgmask))
- page++;
- else
- page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zid, nid);
- cond_resched();
- }
- }
- }
-
- page = NULL;
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn)) {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
- } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
- } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
- } else if (page && (pfn & nr_pgmask)) {
- page++;
- nr_free++;
- } else {
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- free_base_pfn = pfn;
- nr_free = 1;
cond_resched();
+ } else {
+ page++;
}
+ __init_single_page(page, pfn, zid, nid, true);
+ nr_pages++;
}
- /* Free the last block of pages to allocator */
- nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-
- return nr_pages;
+ return (nr_pages);
}
/* Initialise remaining memory on a node */
@@ -1582,10 +1578,21 @@ static int __init deferred_init_memmap(void *data)
}
first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+ /*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, than free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ */
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
+ }
for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_range(nid, zid, spfn, epfn);
+ deferred_free_pages(nid, zid, spfn, epfn);
}
/* Sanity check that the next zone really is unpopulated */
@@ -3391,7 +3398,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_THISNODE)
goto out;
- /* Exhausted what can be done so it's blamo time */
+ /* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
@@ -4272,7 +4279,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
struct page *page;
/*
- * __get_free_pages() returns a 32-bit address, which cannot represent
+ * __get_free_pages() returns a virtual address, which cannot represent
* a highmem page
*/
VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
@@ -5393,15 +5400,20 @@ not_early:
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
+ *
+ * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
struct page *page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid);
+ __init_single_page(page, pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
} else {
- __init_single_pfn(pfn, zone, nid);
+ __init_single_pfn(pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
}
}
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2c16216c29b6..5295ef331165 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -59,7 +59,9 @@
*/
static struct page_ext_operations *page_ext_ops[] = {
+#ifdef CONFIG_DEBUG_PAGEALLOC
&debug_guardpage_ops,
+#endif
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 270a8219ccd0..9886c6073828 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -528,21 +528,18 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
- struct page *page;
- struct page_ext *page_ext;
- unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
- unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long count = 0;
- /* Scan block by block. First and last block may be incomplete */
- pfn = zone->zone_start_pfn;
-
/*
* Walk the zone in pageblock_nr_pages steps. If a page block spans
* a zone boundary, it will be double counted between zones. This does
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
+ unsigned long block_end_pfn;
+
if (!pfn_valid(pfn)) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
@@ -551,9 +548,10 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- page = pfn_to_page(pfn);
-
for (; pfn < block_end_pfn; pfn++) {
+ struct page *page;
+ struct page_ext *page_ext;
+
if (!pfn_valid_within(pfn))
continue;
@@ -635,9 +633,7 @@ static int __init pageowner_init(void)
dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
NULL, &proc_page_owner_operations);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- return 0;
+ return PTR_ERR_OR_ZERO(dentry);
}
late_initcall(pageowner_init)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 1e4ee763c190..cf2af04b34b9 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
#endif
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
- pmd_t entry = *pmdp;
- set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
+ pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return old;
}
#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 7fbe67be86fa..1907688b75ee 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2717,15 +2717,28 @@ continue_resched:
return error;
}
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+ if (file->f_op == &shmem_file_operations)
+ return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+ if (file->f_op == &hugetlbfs_file_operations)
+ return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+ return NULL;
+}
+
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE)
-int shmem_add_seals(struct file *file, unsigned int seals)
+static int memfd_add_seals(struct file *file, unsigned int seals)
{
struct inode *inode = file_inode(file);
- struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned int *file_seals;
int error;
/*
@@ -2758,8 +2771,6 @@ int shmem_add_seals(struct file *file, unsigned int seals)
* other file types.
*/
- if (file->f_op != &shmem_file_operations)
- return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EPERM;
if (seals & ~(unsigned int)F_ALL_SEALS)
@@ -2767,12 +2778,18 @@ int shmem_add_seals(struct file *file, unsigned int seals)
inode_lock(inode);
- if (info->seals & F_SEAL_SEAL) {
+ file_seals = memfd_file_seals_ptr(file);
+ if (!file_seals) {
+ error = -EINVAL;
+ goto unlock;
+ }
+
+ if (*file_seals & F_SEAL_SEAL) {
error = -EPERM;
goto unlock;
}
- if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
+ if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
error = mapping_deny_writable(file->f_mapping);
if (error)
goto unlock;
@@ -2784,25 +2801,22 @@ int shmem_add_seals(struct file *file, unsigned int seals)
}
}
- info->seals |= seals;
+ *file_seals |= seals;
error = 0;
unlock:
inode_unlock(inode);
return error;
}
-EXPORT_SYMBOL_GPL(shmem_add_seals);
-int shmem_get_seals(struct file *file)
+static int memfd_get_seals(struct file *file)
{
- if (file->f_op != &shmem_file_operations)
- return -EINVAL;
+ unsigned int *seals = memfd_file_seals_ptr(file);
- return SHMEM_I(file_inode(file))->seals;
+ return seals ? *seals : -EINVAL;
}
-EXPORT_SYMBOL_GPL(shmem_get_seals);
-long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
long error;
@@ -2812,10 +2826,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
if (arg > UINT_MAX)
return -EINVAL;
- error = shmem_add_seals(file, arg);
+ error = memfd_add_seals(file, arg);
break;
case F_GET_SEALS:
- error = shmem_get_seals(file);
+ error = memfd_get_seals(file);
break;
default:
error = -EINVAL;
@@ -3657,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
- struct shmem_inode_info *info;
+ unsigned int *file_seals;
struct file *file;
int fd, error;
char *name;
@@ -3667,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create,
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
return -EINVAL;
} else {
- /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
- if (flags & MFD_ALLOW_SEALING)
- return -EINVAL;
/* Allow huge page size encoding in flags. */
if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
@@ -3722,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create,
file->f_flags |= O_RDWR | O_LARGEFILE;
if (flags & MFD_ALLOW_SEALING) {
- /*
- * flags check at beginning of function ensures
- * this is not a hugetlbfs (MFD_HUGETLB) file.
- */
- info = SHMEM_I(file_inode(file));
- info->seals &= ~F_SEAL_SEAL;
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
}
fd_install(fd, file);
diff --git a/mm/slab.c b/mm/slab.c
index 4e51ef954026..226906294183 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1316,8 +1316,6 @@ void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
- slab_state = UP;
-
/* 6) resize the head arrays to their final sizes */
mutex_lock(&slab_mutex);
list_for_each_entry(cachep, &slab_caches, list)
@@ -1353,8 +1351,6 @@ static int __init cpucache_init(void)
slab_online_cpu, slab_offline_cpu);
WARN_ON(ret < 0);
- /* Done! */
- slab_state = FULL;
return 0;
}
__initcall(cpucache_init);
diff --git a/mm/slab.h b/mm/slab.h
index ad657ffa44e5..e8e2095a6185 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -78,9 +78,6 @@ extern const struct kmalloc_info_struct {
unsigned long size;
} kmalloc_info[];
-unsigned long calculate_alignment(slab_flags_t flags,
- unsigned long align, unsigned long size);
-
#ifndef CONFIG_SLOB
/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c8cb36774ba1..deeddf95cdcf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -268,6 +268,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s)
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+static unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size)
+{
+ /*
+ * If the user wants hardware cache aligned objects then follow that
+ * suggestion if the object is sufficiently large.
+ *
+ * The hardware cache alignment cannot override the specified
+ * alignment though. If that is greater then use it.
+ */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long ralign;
+
+ ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
+
+ if (align < ARCH_SLAB_MINALIGN)
+ align = ARCH_SLAB_MINALIGN;
+
+ return ALIGN(align, sizeof(void *));
+}
+
+/*
* Find a mergeable slab cache
*/
int slab_unmergeable(struct kmem_cache *s)
@@ -337,33 +366,6 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
return NULL;
}
-/*
- * Figure out what the alignment of the objects will be given a set of
- * flags, a user specified alignment and the size of the objects.
- */
-unsigned long calculate_alignment(slab_flags_t flags,
- unsigned long align, unsigned long size)
-{
- /*
- * If the user wants hardware cache aligned objects then follow that
- * suggestion if the object is sufficiently large.
- *
- * The hardware cache alignment cannot override the specified
- * alignment though. If that is greater then use it.
- */
- if (flags & SLAB_HWCACHE_ALIGN) {
- unsigned long ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- align = max(align, ralign);
- }
-
- if (align < ARCH_SLAB_MINALIGN)
- align = ARCH_SLAB_MINALIGN;
-
- return ALIGN(align, sizeof(void *));
-}
-
static struct kmem_cache *create_cache(const char *name,
size_t object_size, size_t size, size_t align,
slab_flags_t flags, void (*ctor)(void *),
diff --git a/mm/slub.c b/mm/slub.c
index cfd56e5a35fb..693b7074bc53 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
u8 *start;
u8 *fault;
u8 *end;
+ u8 *pad;
int length;
int remainder;
@@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!remainder)
return 1;
+ pad = end - remainder;
metadata_access_enable();
- fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+ fault = memchr_inv(pad, POISON_INUSE, remainder);
metadata_access_disable();
if (!fault)
return 1;
@@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section(KERN_ERR, "Padding ", end - remainder, remainder);
+ print_section(KERN_ERR, "Padding ", pad, remainder);
- restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
+ restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
return 0;
}
@@ -2220,9 +2222,7 @@ static void unfreeze_partials(struct kmem_cache *s,
/*
* Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available. This is done without interrupts disabled and without
- * preemption disabled. The cmpxchg is racy and may put the partial page
- * onto a random cpus partial slot.
+ * slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
diff --git a/mm/sparse.c b/mm/sparse.c
index 2609aba121e8..6b8b5e91ceef 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
*/
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
- return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+ unsigned long coded_mem_map =
+ (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+ BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+ BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
+ return coded_mem_map;
}
/*
diff --git a/mm/swap.c b/mm/swap.c
index 38e1b6374a97..10568b1548d4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -411,7 +411,7 @@ static void __lru_cache_add(struct page *page)
}
/**
- * lru_cache_add: add a page to the page lists
+ * lru_cache_add_anon - add a page to the page lists
* @page: the page to add
*/
void lru_cache_add_anon(struct page *page)
@@ -688,7 +688,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-void lru_add_drain_all_cpuslocked(void)
+/*
+ * Doesn't need any cpu hotplug locking because we do rely on per-cpu
+ * kworkers being shut down before our page_alloc_cpu_dead callback is
+ * executed on the offlined cpu.
+ * Calling this function with cpu hotplug locks held can actually lead
+ * to obscure indirect dependencies via WQ context.
+ */
+void lru_add_drain_all(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
@@ -724,13 +731,6 @@ void lru_add_drain_all_cpuslocked(void)
mutex_unlock(&lock);
}
-void lru_add_drain_all(void)
-{
- get_online_cpus();
- lru_add_drain_all_cpuslocked();
- put_online_cpus();
-}
-
/**
* release_pages - batched put_page()
* @pages: array of pages to release
@@ -930,10 +930,10 @@ EXPORT_SYMBOL(__pagevec_lru_add);
*/
unsigned pagevec_lookup_entries(struct pagevec *pvec,
struct address_space *mapping,
- pgoff_t start, unsigned nr_pages,
+ pgoff_t start, unsigned nr_entries,
pgoff_t *indices)
{
- pvec->nr = find_get_entries(mapping, start, nr_pages,
+ pvec->nr = find_get_entries(mapping, start, nr_entries,
pvec->pages, indices);
return pagevec_count(pvec);
}
@@ -965,9 +965,8 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index
- * @nr_pages: The maximum number of pages
*
- * pagevec_lookup_range() will search for and return a group of up to @nr_pages
+ * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
* pages in the mapping starting from index @start and upto index @end
* (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
* reference against the pages in @pvec.
@@ -977,7 +976,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
* also update @start to index the next page for the traversal.
*
* pagevec_lookup_range() returns the number of pages which were found. If this
- * number is smaller than @nr_pages, the end of specified range has been
+ * number is smaller than PAGEVEC_SIZE, the end of specified range has been
* reached.
*/
unsigned pagevec_lookup_range(struct pagevec *pvec,
diff --git a/mm/truncate.c b/mm/truncate.c
index e4b4cf0f4070..c34e2fd4f583 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -179,12 +179,8 @@ static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
- loff_t holelen;
-
- holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
- unmap_mapping_range(mapping,
- (loff_t)page->index << PAGE_SHIFT,
- holelen, 0);
+ pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+ unmap_mapping_pages(mapping, page->index, nr, false);
}
if (page_has_private(page))
@@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
/*
* Zap the rest of the file in one hit.
*/
- unmap_mapping_range(mapping,
- (loff_t)index << PAGE_SHIFT,
- (loff_t)(1 + end - index)
- << PAGE_SHIFT,
- 0);
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
did_range_unmap = 1;
} else {
/*
* Just zap this page
*/
- unmap_mapping_range(mapping,
- (loff_t)index << PAGE_SHIFT,
- PAGE_SIZE, 0);
+ unmap_mapping_pages(mapping, index,
+ 1, false);
}
}
BUG_ON(page_mapped(page));
@@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* get remapped later.
*/
if (dax_mapping(mapping)) {
- unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
- (loff_t)(end - start + 1) << PAGE_SHIFT, 0);
+ unmap_mapping_pages(mapping, start, end - start + 1, false);
}
out:
cleancache_invalidate_inode(mapping);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 47d5ced51f2d..fdd3fc6be862 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
return nr;
}
-unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
-{
- unsigned long nr;
-
- nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
- node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
- node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
- node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
- node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
-
- return nr;
-}
-
/**
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
@@ -310,9 +294,7 @@ EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
- struct shrinker *shrinker,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+ struct shrinker *shrinker, int priority)
{
unsigned long freed = 0;
unsigned long long delta;
@@ -337,9 +319,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = (4 * nr_scanned) / shrinker->seeks;
- delta *= freeable;
- do_div(delta, nr_eligible + 1);
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
total_scan += delta;
if (total_scan < 0) {
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -373,8 +355,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- nr_scanned, nr_eligible,
- freeable, delta, total_scan);
+ freeable, delta, total_scan, priority);
/*
* Normally, we should not scan less than batch_size objects in one
@@ -434,8 +415,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* @gfp_mask: allocation context
* @nid: node whose slab caches to target
* @memcg: memory cgroup whose slab caches to target
- * @nr_scanned: pressure numerator
- * @nr_eligible: pressure denominator
+ * @priority: the reclaim priority
*
* Call the shrink functions to age shrinkable caches.
*
@@ -447,20 +427,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* objects from the memory cgroup specified. Otherwise, only unaware
* shrinkers are called.
*
- * @nr_scanned and @nr_eligible form a ratio that indicate how much of
- * the available objects should be scanned. Page reclaim for example
- * passes the number of pages scanned and the number of pages on the
- * LRU lists that it considered on @nid, plus a bias in @nr_scanned
- * when it encountered mapped pages. The ratio is further biased by
- * the ->seeks setting of the shrink function, which indicates the
- * cost to recreate an object relative to that of an LRU page.
+ * @priority is sc->priority, we take the number of objects and >> by priority
+ * in order to get the scan target.
*
* Returns the number of reclaimed slab objects.
*/
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+ int priority)
{
struct shrinker *shrinker;
unsigned long freed = 0;
@@ -468,9 +442,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
- if (nr_scanned == 0)
- nr_scanned = SWAP_CLUSTER_MAX;
-
if (!down_read_trylock(&shrinker_rwsem)) {
/*
* If we would return 0, our callers would understand that we
@@ -501,7 +472,16 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
sc.nid = 0;
- freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+ freed += do_shrink_slab(&sc, shrinker, priority);
+ /*
+ * Bail out if someone want to register a new shrinker to
+ * prevent the regsitration from being stalled for long periods
+ * by parallel ongoing shrinking.
+ */
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
}
up_read(&shrinker_rwsem);
@@ -519,8 +499,7 @@ void drop_slab_node(int nid)
freed = 0;
do {
- freed += shrink_slab(GFP_KERNEL, nid, memcg,
- 1000, 1000);
+ freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
} while (freed > 10);
}
@@ -1436,14 +1415,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
if (PageDirty(page)) {
struct address_space *mapping;
+ bool migrate_dirty;
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
- * without blocking
+ * without blocking. However, we can be racing with
+ * truncation so it's necessary to lock the page
+ * to stabilise the mapping as truncation holds
+ * the page lock until after the page is removed
+ * from the page cache.
*/
+ if (!trylock_page(page))
+ return ret;
+
mapping = page_mapping(page);
- if (mapping && !mapping->a_ops->migratepage)
+ migrate_dirty = mapping && mapping->a_ops->migratepage;
+ unlock_page(page);
+ if (!migrate_dirty)
return ret;
}
}
@@ -2615,14 +2604,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
-
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
if (memcg)
shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->nr_scanned - scanned,
- lru_pages);
+ memcg, sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -2646,14 +2633,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
- /*
- * Shrink the slab caches in the same proportion that
- * the eligible LRU pages were scanned.
- */
if (global_reclaim(sc))
shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
- sc->nr_scanned - nr_scanned,
- node_lru_pages);
+ sc->priority);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
diff --git a/mm/zpool.c b/mm/zpool.c
index fd3ff719c32c..e1e7aa6d1d06 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -21,6 +21,7 @@ struct zpool {
struct zpool_driver *driver;
void *pool;
const struct zpool_ops *ops;
+ bool evictable;
struct list_head list;
};
@@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool);
*
* This creates a new zpool of the specified type. The gfp flags will be
* used when allocating memory, if the implementation supports it. If the
- * ops param is NULL, then the created zpool will not be shrinkable.
+ * ops param is NULL, then the created zpool will not be evictable.
*
* Implementations must guarantee this to be thread-safe.
*
@@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
zpool->driver = driver;
zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
+ zpool->evictable = driver->shrink && ops && ops->evict;
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
int zpool_shrink(struct zpool *zpool, unsigned int pages,
unsigned int *reclaimed)
{
- return zpool->driver->shrink(zpool->pool, pages, reclaimed);
+ return zpool->driver->shrink ?
+ zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
}
/**
@@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool)
return zpool->driver->total_size(zpool->pool);
}
+/**
+ * zpool_evictable() - Test if zpool is potentially evictable
+ * @pool The zpool to test
+ *
+ * Zpool is only potentially evictable when it's created with struct
+ * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
+ *
+ * However, it doesn't necessarily mean driver will use zpool_ops.evict
+ * in its implementation of zpool_driver.shrink. It could do internal
+ * defragmentation instead.
+ *
+ * Returns: true if potentially evictable; false otherwise.
+ */
+bool zpool_evictable(struct zpool *zpool)
+{
+ return zpool->evictable;
+}
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 683c0651098c..c3013505c305 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -46,6 +46,7 @@
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
+#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
@@ -257,11 +258,7 @@ struct zs_pool {
/* Compact classes */
struct shrinker shrinker;
- /*
- * To signify that register_shrinker() was successful
- * and unregister_shrinker() will not Oops.
- */
- bool shrinker_enabled;
+
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -407,12 +404,6 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
-static int zs_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- return -EINVAL;
-}
-
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -450,7 +441,6 @@ static struct zpool_driver zs_zpool_driver = {
.destroy = zs_zpool_destroy,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
- .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -1057,7 +1047,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
* Reset OBJ_TAG_BITS bit to last link to tell
* whether it's allocated object or not.
*/
- link->next = -1 << OBJ_TAG_BITS;
+ link->next = -1UL << OBJ_TAG_BITS;
}
kunmap_atomic(vaddr);
page = next_page;
@@ -2324,10 +2314,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
static void zs_unregister_shrinker(struct zs_pool *pool)
{
- if (pool->shrinker_enabled) {
- unregister_shrinker(&pool->shrinker);
- pool->shrinker_enabled = false;
- }
+ unregister_shrinker(&pool->shrinker);
}
static int zs_register_shrinker(struct zs_pool *pool)
@@ -2426,11 +2413,13 @@ struct zs_pool *zs_create_pool(const char *name)
goto err;
/*
- * Not critical, we still can use the pool
- * and user can trigger compaction manually.
+ * Not critical since shrinker is only used to trigger internal
+ * defragmentation of the pool which is pretty optional thing. If
+ * registration fails we still can use the pool normally and user can
+ * trigger compaction manually. Thus, ignore return code.
*/
- if (zs_register_shrinker(pool) == 0)
- pool->shrinker_enabled = true;
+ zs_register_shrinker(pool);
+
return pool;
err:
diff --git a/mm/zswap.c b/mm/zswap.c
index d39581a076c3..c004aa4fd3f4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -49,6 +49,8 @@
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+/* The number of same-value filled pages currently stored in zswap */
+static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -116,6 +118,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
+/* Enable/disable handling same-value filled pages (enabled by default) */
+static bool zswap_same_filled_pages_enabled = true;
+module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
+ bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -145,9 +152,10 @@ struct zswap_pool {
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
- * decompression
+ * decompression. For a same value filled page length is 0.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
+ * value - value of the same-value filled pages which have same content
*/
struct zswap_entry {
struct rb_node rbnode;
@@ -155,7 +163,10 @@ struct zswap_entry {
int refcount;
unsigned int length;
struct zswap_pool *pool;
- unsigned long handle;
+ union {
+ unsigned long handle;
+ unsigned long value;
+ };
};
struct zswap_header {
@@ -320,8 +331,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
*/
static void zswap_free_entry(struct zswap_entry *entry)
{
- zpool_free(entry->pool->zpool, entry->handle);
- zswap_pool_put(entry->pool);
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zpool_free(entry->pool->zpool, entry->handle);
+ zswap_pool_put(entry->pool);
+ }
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
zswap_update_total_size();
@@ -953,6 +968,28 @@ static int zswap_shrink(void)
return ret;
}
+static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
+{
+ unsigned int pos;
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+ for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+ if (page[pos] != page[0])
+ return 0;
+ }
+ *value = page[0];
+ return 1;
+}
+
+static void zswap_fill_page(void *ptr, unsigned long value)
+{
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+ memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
+}
+
/*********************************
* frontswap hooks
**********************************/
@@ -964,11 +1001,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
struct zswap_entry *entry, *dupentry;
struct crypto_comp *tfm;
int ret;
- unsigned int dlen = PAGE_SIZE, len;
- unsigned long handle;
+ unsigned int hlen, dlen = PAGE_SIZE;
+ unsigned long handle, value;
char *buf;
u8 *src, *dst;
- struct zswap_header *zhdr;
+ struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
if (!zswap_enabled || !tree) {
ret = -ENODEV;
@@ -993,6 +1030,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
+ if (zswap_same_filled_pages_enabled) {
+ src = kmap_atomic(page);
+ if (zswap_is_page_same_filled(src, &value)) {
+ kunmap_atomic(src);
+ entry->offset = offset;
+ entry->length = 0;
+ entry->value = value;
+ atomic_inc(&zswap_same_filled_pages);
+ goto insert_entry;
+ }
+ kunmap_atomic(src);
+ }
+
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool) {
@@ -1013,8 +1063,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* store */
- len = dlen + sizeof(struct zswap_header);
- ret = zpool_malloc(entry->pool->zpool, len,
+ hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+ ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
__GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
&handle);
if (ret == -ENOSPC) {
@@ -1025,10 +1075,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto put_dstmem;
}
- zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
- zhdr->swpentry = swp_entry(type, offset);
- buf = (u8 *)(zhdr + 1);
- memcpy(buf, dst, dlen);
+ buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
+ memcpy(buf, &zhdr, hlen);
+ memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
put_cpu_var(zswap_dstmem);
@@ -1037,6 +1086,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
entry->handle = handle;
entry->length = dlen;
+insert_entry:
/* map */
spin_lock(&tree->lock);
do {
@@ -1089,10 +1139,18 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
}
spin_unlock(&tree->lock);
+ if (!entry->length) {
+ dst = kmap_atomic(page);
+ zswap_fill_page(dst, entry->value);
+ kunmap_atomic(dst);
+ goto freeentry;
+ }
+
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
- ZPOOL_MM_RO) + sizeof(struct zswap_header);
+ src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+ if (zpool_evictable(entry->pool->zpool))
+ src += sizeof(struct zswap_header);
dst = kmap_atomic(page);
tfm = *get_cpu_ptr(entry->pool->tfm);
ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
@@ -1101,6 +1159,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);
+freeentry:
spin_lock(&tree->lock);
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
@@ -1209,6 +1268,8 @@ static int __init zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_pool_total_size);
debugfs_create_atomic_t("stored_pages", S_IRUGO,
zswap_debugfs_root, &zswap_stored_pages);
+ debugfs_create_atomic_t("same_filled_pages", 0444,
+ zswap_debugfs_root, &zswap_same_filled_pages);
return 0;
}