summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-03-22 09:04:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-03-22 09:04:48 -0700
commit95211279c5ad00a317c98221d7e4365e02f20836 (patch)
tree2ddc8625378d2915b8c96392f3cf6663b705ed55 /mm
parent5375871d432ae9fc581014ac117b96aaee3cd0c7 (diff)
parent12724850e8064f64b6223d26d78c0597c742c65a (diff)
downloadlinux-95211279c5ad00a317c98221d7e4365e02f20836.tar.bz2
Merge branch 'akpm' (Andrew's patch-bomb)
Merge first batch of patches from Andrew Morton: "A few misc things and all the MM queue" * emailed from Andrew Morton <akpm@linux-foundation.org>: (92 commits) memcg: avoid THP split in task migration thp: add HPAGE_PMD_* definitions for !CONFIG_TRANSPARENT_HUGEPAGE memcg: clean up existing move charge code mm/memcontrol.c: remove unnecessary 'break' in mem_cgroup_read() mm/memcontrol.c: remove redundant BUG_ON() in mem_cgroup_usage_unregister_event() mm/memcontrol.c: s/stealed/stolen/ memcg: fix performance of mem_cgroup_begin_update_page_stat() memcg: remove PCG_FILE_MAPPED memcg: use new logic for page stat accounting memcg: remove PCG_MOVE_LOCK flag from page_cgroup memcg: simplify move_account() check memcg: remove EXPORT_SYMBOL(mem_cgroup_update_page_stat) memcg: kill dead prev_priority stubs memcg: remove PCG_CACHE page_cgroup flag memcg: let css_get_next() rely upon rcu_read_lock() cgroup: revert ss_id_lock to spinlock idr: make idr_get_next() good for rcu_read_lock() memcg: remove unnecessary thp check in page stat accounting memcg: remove redundant returns memcg: enum lru_list lru ...
Diffstat (limited to 'mm')
-rw-r--r--mm/bootmem.c5
-rw-r--r--mm/compaction.c77
-rw-r--r--mm/filemap.c20
-rw-r--r--mm/huge_memory.c125
-rw-r--r--mm/hugetlb.c184
-rw-r--r--mm/ksm.c34
-rw-r--r--mm/memcontrol.c473
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c53
-rw-r--r--mm/mempolicy.c62
-rw-r--r--mm/migrate.c36
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mmap.c51
-rw-r--r--mm/mmu_context.c2
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/oom_kill.c166
-rw-r--r--mm/page-writeback.c1
-rw-r--r--mm/page_alloc.c58
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/pgtable-generic.c5
-rw-r--r--mm/rmap.c70
-rw-r--r--mm/shmem.c88
-rw-r--r--mm/slab.c13
-rw-r--r--mm/slub.c40
-rw-r--r--mm/sparse.c30
-rw-r--r--mm/swap.c4
-rw-r--r--mm/swap_state.c24
-rw-r--r--mm/swapfile.c58
-rw-r--r--mm/util.c41
-rw-r--r--mm/vmscan.c151
30 files changed, 1108 insertions, 771 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf2..0131170c9d54 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
bootmem_data_t *bdata;
- unsigned long pfn, goal, limit;
+ unsigned long pfn, goal;
pfn = section_nr_to_pfn(section_nr);
goal = pfn << PAGE_SHIFT;
- limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
- return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+ return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
}
#endif
diff --git a/mm/compaction.c b/mm/compaction.c
index d9ebebe1a2aa..74a8c825ff28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control {
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
- unsigned int order; /* order a direct compactor needs */
+ int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
};
@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* Compact all zones within a node */
-static int compact_node(int nid)
+static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
int zoneid;
- pg_data_t *pgdat;
struct zone *zone;
- if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
- return -EINVAL;
- pgdat = NODE_DATA(nid);
-
- /* Flush pending updates to the LRU lists */
- lru_add_drain_all();
-
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
- struct compact_control cc = {
- .nr_freepages = 0,
- .nr_migratepages = 0,
- .order = -1,
- .sync = true,
- };
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
- cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
-
- compact_zone(zone, &cc);
+ cc->nr_freepages = 0;
+ cc->nr_migratepages = 0;
+ cc->zone = zone;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
+ if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+ compact_zone(zone, cc);
+
+ if (cc->order > 0) {
+ int ok = zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0);
+ if (ok && cc->order > zone->compact_order_failed)
+ zone->compact_order_failed = cc->order + 1;
+ /* Currently async compaction is never deferred. */
+ else if (!ok && cc->sync)
+ defer_compaction(zone, cc->order);
+ }
- VM_BUG_ON(!list_empty(&cc.freepages));
- VM_BUG_ON(!list_empty(&cc.migratepages));
+ VM_BUG_ON(!list_empty(&cc->freepages));
+ VM_BUG_ON(!list_empty(&cc->migratepages));
}
return 0;
}
+int compact_pgdat(pg_data_t *pgdat, int order)
+{
+ struct compact_control cc = {
+ .order = order,
+ .sync = false,
+ };
+
+ return __compact_pgdat(pgdat, &cc);
+}
+
+static int compact_node(int nid)
+{
+ struct compact_control cc = {
+ .order = -1,
+ .sync = true,
+ };
+
+ return __compact_pgdat(NODE_DATA(nid), &cc);
+}
+
/* Compact all nodes in the system */
static int compact_nodes(void)
{
int nid;
+ /* Flush pending updates to the LRU lists */
+ lru_add_drain_all();
+
for_each_online_node(nid)
compact_node(nid);
@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- compact_node(dev->id);
+ int nid = dev->id;
+
+ if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
+ /* Flush pending updates to the LRU lists */
+ lru_add_drain_all();
+
+ compact_node(nid);
+ }
return count;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f8165075a5a..843042045dc9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * (code doesn't rely on that order, so you could switch it around)
- * ->tasklist_lock (memory_failure, collect_procs_ao)
- * ->i_mmap_mutex
+ * ->i_mmap_mutex
+ * ->tasklist_lock (memory_failure, collect_procs_ao)
*/
/*
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
struct page *page;
if (cpuset_do_page_mem_spread()) {
- get_mems_allowed();
- n = cpuset_mem_spread_node();
- page = alloc_pages_exact_node(n, gfp, 0);
- put_mems_allowed();
+ unsigned int cpuset_mems_cookie;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
return page;
}
return alloc_pages(gfp, 0);
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
struct page *page;
gfp_t gfp_notmask = 0;
- gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE;
+ gfp_mask = mapping_gfp_mask(mapping);
+ if (mapping_cap_account_dirty(mapping))
+ gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
gfp_notmask = __GFP_FS;
repeat:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8f7fc394f636..f0e5306eeb55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
{
int ret = 0;
- spin_lock(&tlb->mm->page_table_lock);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&tlb->mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma,
- pmd);
- } else {
- struct page *page;
- pgtable_t pgtable;
- pgtable = get_pmd_huge_pte(tlb->mm);
- page = pmd_page(*pmd);
- pmd_clear(pmd);
- tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
- pte_free(tlb->mm, pgtable);
- ret = 1;
- }
- } else
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ struct page *page;
+ pgtable_t pgtable;
+ pgtable = get_pmd_huge_pte(tlb->mm);
+ page = pmd_page(*pmd);
+ pmd_clear(pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
-
+ tlb_remove_page(tlb, page);
+ pte_free(tlb->mm, pgtable);
+ ret = 1;
+ }
return ret;
}
@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
int ret = 0;
- spin_lock(&vma->vm_mm->page_table_lock);
- if (likely(pmd_trans_huge(*pmd))) {
- ret = !pmd_trans_splitting(*pmd);
- spin_unlock(&vma->vm_mm->page_table_lock);
- if (unlikely(!ret))
- wait_split_huge_page(vma->anon_vma, pmd);
- else {
- /*
- * All logical pages in the range are present
- * if backed by a huge page.
- */
- memset(vec, 1, (end - addr) >> PAGE_SHIFT);
- }
- } else
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ /*
+ * All logical pages in the range are present
+ * if backed by a huge page.
+ */
spin_unlock(&vma->vm_mm->page_table_lock);
+ memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+ ret = 1;
+ }
return ret;
}
@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
- spin_lock(&mm->page_table_lock);
- if (likely(pmd_trans_huge(*old_pmd))) {
- if (pmd_trans_splitting(*old_pmd)) {
- spin_unlock(&mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma, old_pmd);
- ret = -1;
- } else {
- pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
- VM_BUG_ON(!pmd_none(*new_pmd));
- set_pmd_at(mm, new_addr, new_pmd, pmd);
- spin_unlock(&mm->page_table_lock);
- ret = 1;
- }
- } else {
+ ret = __pmd_trans_huge_lock(old_pmd, vma);
+ if (ret == 1) {
+ pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ VM_BUG_ON(!pmd_none(*new_pmd));
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
spin_unlock(&mm->page_table_lock);
}
out:
@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
- spin_lock(&mm->page_table_lock);
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ pmd_t entry;
+ entry = pmdp_get_and_clear(mm, addr, pmd);
+ entry = pmd_modify(entry, newprot);
+ set_pmd_at(mm, addr, pmd, entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * Returns 1 if a given pmd maps a stable (not under splitting) thp.
+ * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ *
+ * Note that if it returns 1, this routine returns without unlocking page
+ * table locks. So callers must unlock them.
+ */
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+{
+ spin_lock(&vma->vm_mm->page_table_lock);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
+ return -1;
} else {
- pmd_t entry;
-
- entry = pmdp_get_and_clear(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
- set_pmd_at(mm, addr, pmd, entry);
- spin_unlock(&vma->vm_mm->page_table_lock);
- ret = 1;
+ /* Thp mapped by 'pmd' is stable, so we can
+ * handle it as it is. */
+ return 1;
}
- } else
- spin_unlock(&vma->vm_mm->page_table_lock);
-
- return ret;
+ }
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
}
pmd_t *page_check_address_pmd(struct page *page,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a876871f6be5..afa057a1d3fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
*/
static DEFINE_SPINLOCK(hugetlb_lock);
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+ spin_unlock(&spool->lock);
+
+ /* If no pages are used, and no other handles to the subpool
+ * remain, free the subpool the subpool remain */
+ if (free)
+ kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+ struct hugepage_subpool *spool;
+
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1;
+ spool->max_hpages = nr_blocks;
+ spool->used_hpages = 0;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ int ret = 0;
+
+ if (!spool)
+ return 0;
+
+ spin_lock(&spool->lock);
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
+ spool->used_hpages += delta;
+ } else {
+ ret = -ENOMEM;
+ }
+ spin_unlock(&spool->lock);
+
+ return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ if (!spool)
+ return;
+
+ spin_lock(&spool->lock);
+ spool->used_hpages -= delta;
+ /* If hugetlbfs_put_super couldn't free spool due to
+ * an outstanding quota reference, free it now. */
+ unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+ return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
- struct page *page = NULL;
+ struct page *page;
struct mempolicy *mpol;
nodemask_t *nodemask;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ unsigned int cpuset_mems_cookie;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
/*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
}
}
}
-err:
+
mpol_cond_put(mpol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
+
+err:
+ mpol_cond_put(mpol);
+ return NULL;
}
static void update_and_free_page(struct hstate *h, struct page *page)
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
- struct address_space *mapping;
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
- mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- if (mapping)
- hugetlb_put_quota(mapping, 1);
+ hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
struct page *page, *tmp;
int ret, i;
int needed, allocated;
+ bool alloc_ok = true;
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
- if (!page)
- /*
- * We were not able to allocate enough pages to
- * satisfy the entire reservation so we free what
- * we've allocated so far.
- */
- goto free;
-
+ if (!page) {
+ alloc_ok = false;
+ break;
+ }
list_add(&page->lru, &surplus_list);
}
- allocated += needed;
+ allocated += i;
/*
* After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
spin_lock(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
- if (needed > 0)
- goto retry;
-
+ if (needed > 0) {
+ if (alloc_ok)
+ goto retry;
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ goto free;
+ }
/*
* The surplus_list now contains _at_least_ the number of extra pages
* needed to accommodate the reservation. Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
+free:
spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
-free:
if (!list_empty(&surplus_list)) {
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru);
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
/*
* Determine if the huge page at addr within the vma has an associated
* reservation. Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed. Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
*/
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
+ struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
long chg;
/*
- * Processes that did not create the mapping will have no reserves and
- * will not have accounted against quota. Check that the quota can be
- * made before satisfying the allocation
- * MAP_NORESERVE mappings may also need pages and quota allocated
- * if no reserve mapping overlaps.
+ * Processes that did not create the mapping will have no
+ * reserves and will not have accounted against subpool
+ * limit. Check that the subpool limit can be made before
+ * satisfying the allocation MAP_NORESERVE mappings may also
+ * need pages and subpool limit allocated allocated if no reserve
+ * mapping overlaps.
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
return ERR_PTR(-VM_FAULT_OOM);
if (chg)
- if (hugetlb_get_quota(inode->i_mapping, chg))
+ if (hugepage_subpool_get_pages(spool, chg))
return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
if (!page) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
- hugetlb_put_quota(inode->i_mapping, chg);
+ hugepage_subpool_put_pages(spool, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
- set_page_private(page, (unsigned long) mapping);
+ set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *reservations = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve;
unsigned long start;
unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
if (reserve) {
hugetlb_acct_memory(h, -reserve);
- hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+ hugepage_subpool_put_pages(spool, reserve);
}
}
}
@@ -2276,6 +2366,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (pte_dirty(pte))
set_page_dirty(page);
list_add(&page->lru, &page_list);
+
+ /* Bail out after unmapping reference page if supplied */
+ if (ref_page)
+ break;
}
flush_tlb_range(vma, start, end);
spin_unlock(&mm->page_table_lock);
@@ -2316,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
address = address & huge_page_mask(h);
pgoff = vma_hugecache_offset(h, vma, address);
- mapping = (struct address_space *)page_private(page);
+ mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
- * and filesystem quota without using reserves
+ * without using reserves
*/
if (vm_flags & VM_NORESERVE)
return 0;
@@ -2900,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
if (chg < 0)
return chg;
- /* There must be enough filesystem quota for the mapping */
- if (hugetlb_get_quota(inode->i_mapping, chg))
+ /* There must be enough pages in the subpool for the mapping */
+ if (hugepage_subpool_get_pages(spool, chg))
return -ENOSPC;
/*
* Check enough hugepages are available for the reservation.
- * Hand back the quota if there are not
+ * Hand the pages back to the subpool if there are not
*/
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
- hugetlb_put_quota(inode->i_mapping, chg);
+ hugepage_subpool_put_pages(spool, chg);
return ret;
}
@@ -2934,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ struct hugepage_subpool *spool = subpool_inode(inode);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugetlb_put_quota(inode->i_mapping, (chg - freed));
+ hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}
diff --git a/mm/ksm.c b/mm/ksm.c
index a6d3fb7e6c10..47c885368890 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -374,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
+static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ if (ksm_test_exit(mm))
+ return NULL;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ return NULL;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ return NULL;
+ return vma;
+}
+
static void break_cow(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -387,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
put_anon_vma(rmap_item->anon_vma);
down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto out;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- goto out;
- if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
- goto out;
- break_ksm(vma, addr);
-out:
+ vma = find_mergeable_vma(mm, addr);
+ if (vma)
+ break_ksm(vma, addr);
up_read(&mm->mmap_sem);
}
@@ -421,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
struct page *page;
down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto out;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- goto out;
- if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ vma = find_mergeable_vma(mm, addr);
+ if (!vma)
goto out;
page = follow_page(vma, addr, FOLL_GET);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..b2ee6df0e9bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
- MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
MEM_CGROUP_STAT_NSTATS,
};
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
*/
struct mem_cgroup_per_zone {
struct lruvec lruvec;
- unsigned long count[NR_LRU_LISTS];
+ unsigned long lru_size[NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
unsigned long long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
- struct mem_cgroup *mem; /* Back pointer, we cannot */
+ struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
-/* Macro for accessing counter */
-#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -300,6 +297,12 @@ struct mem_cgroup {
*/
unsigned long move_charge_at_immigrate;
/*
+ * set > 0 if pages under this cgroup are moving to other cgroup.
+ */
+ atomic_t moving_account;
+ /* taken only while moving_account > 0 */
+ spinlock_t move_lock;
+ /*
* percpu counter.
*/
struct mem_cgroup_stat_cpu *stat;
@@ -612,9 +615,9 @@ retry:
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
- __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
- if (!res_counter_soft_limit_excess(&mz->mem->res) ||
- !css_tryget(&mz->mem->css))
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+ !css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
- bool file, int nr_pages)
+ bool anon, int nr_pages)
{
preempt_disable();
- if (file)
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+ /*
+ * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
+ * counted as CACHE even if it's on ANON LRU.
+ */
+ if (anon)
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
nr_pages);
else
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
/* pagein of a big page is an event. So, ignore page size */
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
- enum lru_list l;
+ enum lru_list lru;
unsigned long ret = 0;
mz = mem_cgroup_zoneinfo(memcg, nid, zid);
- for_each_lru(l) {
- if (BIT(l) & lru_mask)
- ret += MEM_CGROUP_ZSTAT(mz, l);
+ for_each_lru(lru) {
+ if (BIT(lru) & lru_mask)
+ ret += mz->lru_size[lru];
}
return ret;
}
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
mz = page_cgroup_zoneinfo(memcg, page);
/* compound_order() is stabilized through lru_lock */
- MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
+ mz->lru_size[lru] += 1 << compound_order(page);
return &mz->lruvec;
}
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
VM_BUG_ON(!memcg);
mz = page_cgroup_zoneinfo(memcg, page);
/* huge page split is done under lru_lock. so, we have no races. */
- VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
- MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
+ VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
+ mz->lru_size[lru] -= 1 << compound_order(page);
}
void mem_cgroup_lru_del(struct page *page)
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return memcg->swappiness;
}
-static void mem_cgroup_start_move(struct mem_cgroup *memcg)
-{
- int cpu;
+/*
+ * memcg->moving_account is used for checking possibility that some thread is
+ * calling move_account(). When a thread on CPU-A starts moving pages under
+ * a memcg, other threads should check memcg->moving_account under
+ * rcu_read_lock(), like this:
+ *
+ * CPU-A CPU-B
+ * rcu_read_lock()
+ * memcg->moving_account+1 if (memcg->mocing_account)
+ * take heavy locks.
+ * synchronize_rcu() update something.
+ * rcu_read_unlock()
+ * start move here.
+ */
- get_online_cpus();
- spin_lock(&memcg->pcp_counter_lock);
- for_each_online_cpu(cpu)
- per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
- memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
- spin_unlock(&memcg->pcp_counter_lock);
- put_online_cpus();
+/* for quick checking without looking up memcg */
+atomic_t memcg_moving __read_mostly;
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
+{
+ atomic_inc(&memcg_moving);
+ atomic_inc(&memcg->moving_account);
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
- int cpu;
-
- if (!memcg)
- return;
- get_online_cpus();
- spin_lock(&memcg->pcp_counter_lock);
- for_each_online_cpu(cpu)
- per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
- memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
- spin_unlock(&memcg->pcp_counter_lock);
- put_online_cpus();
+ /*
+ * Now, mem_cgroup_clear_mc() may call this function with NULL.
+ * We check NULL in callee rather than caller.
+ */
+ if (memcg) {
+ atomic_dec(&memcg_moving);
+ atomic_dec(&memcg->moving_account);
+ }
}
+
/*
* 2 routines for checking "mem" is under move_account() or not.
*
- * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
- * for avoiding race in accounting. If true,
+ * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
+ * is used for avoiding races in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
* waiting at hith-memory prressure caused by "move".
*/
-static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
+static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
VM_BUG_ON(!rcu_read_lock_held());
- return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+ return atomic_read(&memcg->moving_account) > 0;
}
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
+/*
+ * Take this lock when
+ * - a code tries to modify page's memcg while it's USED.
+ * - a code tries to modify page state accounting in a memcg.
+ * see mem_cgroup_stolen(), too.
+ */
+static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
+ unsigned long *flags)
+{
+ spin_lock_irqsave(&memcg->move_lock, *flags);
+}
+
+static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
+ unsigned long *flags)
+{
+ spin_unlock_irqrestore(&memcg->move_lock, *flags);
+}
+
/**
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
if (!memcg || !p)
return;
-
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
- struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
- *oom_wait_memcg;
+ struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+ struct mem_cgroup *oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
- oom_wait_memcg = oom_wait_info->mem;
+ oom_wait_memcg = oom_wait_info->memcg;
/*
- * Both of oom_wait_info->mem and wake_mem are stable under us.
+ * Both of oom_wait_info->memcg and wake_memcg are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
-bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
+bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
struct oom_wait_info owait;
bool locked, need_to_kill;
- owait.mem = memcg;
+ owait.memcg = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, mask);
+ mem_cgroup_out_of_memory(memcg, mask, order);
} else {
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
- * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
- * possibility of race condition. If there is, we take a lock.
+ * small, we check mm->moving_account and detect there are possibility of race
+ * If there is, we take a lock.
*/
+void __mem_cgroup_begin_update_page_stat(struct page *page,
+ bool *locked, unsigned long *flags)
+{
+ struct mem_cgroup *memcg;
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+again:
+ memcg = pc->mem_cgroup;
+ if (unlikely(!memcg || !PageCgroupUsed(pc)))
+ return;
+ /*
+ * If this memory cgroup is not under account moving, we don't
+ * need to take move_lock_page_cgroup(). Because we already hold
+ * rcu_read_lock(), any calls to move_account will be delayed until
+ * rcu_read_unlock() if mem_cgroup_stolen() == true.
+ */
+ if (!mem_cgroup_stolen(memcg))
+ return;
+
+ move_lock_mem_cgroup(memcg, flags);
+ if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
+ move_unlock_mem_cgroup(memcg, flags);
+ goto again;
+ }
+ *locked = true;
+}
+
+void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ /*
+ * It's guaranteed that pc->mem_cgroup never changes while
+ * lock is held because a routine modifies pc->mem_cgroup
+ * should take move_lock_page_cgroup().
+ */
+ move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+}
+
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc = lookup_page_cgroup(page);
- bool need_unlock = false;
unsigned long uninitialized_var(flags);
if (mem_cgroup_disabled())
return;
- rcu_read_lock();
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
- goto out;
- /* pc->mem_cgroup is unstable ? */
- if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
- /* take a lock against to access pc->mem_cgroup */
- move_lock_page_cgroup(pc, &flags);
- need_unlock = true;
- memcg = pc->mem_cgroup;
- if (!memcg || !PageCgroupUsed(pc))
- goto out;
- }
+ return;
switch (idx) {
case MEMCG_NR_FILE_MAPPED:
- if (val > 0)
- SetPageCgroupFileMapped(pc);
- else if (!page_mapped(page))
- ClearPageCgroupFileMapped(pc);
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
}
this_cpu_add(memcg->stat->count[idx], val);
-
-out:
- if (unlikely(need_unlock))
- move_unlock_page_cgroup(pc, &flags);
- rcu_read_unlock();
- return;
}
-EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
per_cpu(memcg->stat->events[i], cpu) = 0;
memcg->nocpu_base.events[i] += x;
}
- /* need to clear ON_MOVE value, works as a kind of lock. */
- per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
- spin_unlock(&memcg->pcp_counter_lock);
-}
-
-static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
-{
- int idx = MEM_CGROUP_ON_MOVE;
-
- spin_lock(&memcg->pcp_counter_lock);
- per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
spin_unlock(&memcg->pcp_counter_lock);
}
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
struct memcg_stock_pcp *stock;
struct mem_cgroup *iter;
- if ((action == CPU_ONLINE)) {
- for_each_mem_cgroup(iter)
- synchronize_mem_cgroup_on_move(iter, cpu);
+ if (action == CPU_ONLINE)
return NOTIFY_OK;
- }
if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (!oom_check)
return CHARGE_NOMEM;
/* check OOM */
- if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
return CHARGE_OOM_DIE;
return CHARGE_RETRY;
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
{
struct zone *uninitialized_var(zone);
bool was_on_lru = false;
+ bool anon;
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
* See mem_cgroup_add_lru_list(), etc.
*/
smp_wmb();
- switch (ctype) {
- case MEM_CGROUP_CHARGE_TYPE_CACHE:
- case MEM_CGROUP_CHARGE_TYPE_SHMEM:
- SetPageCgroupCache(pc);
- SetPageCgroupUsed(pc);
- break;
- case MEM_CGROUP_CHARGE_TYPE_MAPPED:
- ClearPageCgroupCache(pc);
- SetPageCgroupUsed(pc);
- break;
- default:
- break;
- }
+ SetPageCgroupUsed(pc);
if (lrucare) {
if (was_on_lru) {
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
spin_unlock_irq(&zone->lru_lock);
}
- mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+ anon = true;
+ else
+ anon = false;
+
+ mem_cgroup_charge_statistics(memcg, anon, nr_pages);
unlock_page_cgroup(pc);
/*
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
- (1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
{
unsigned long flags;
int ret;
+ bool anon = PageAnon(page);
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(page));
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
goto unlock;
- move_lock_page_cgroup(pc, &flags);
+ move_lock_mem_cgroup(from, &flags);
- if (PageCgroupFileMapped(pc)) {
+ if (!anon && page_mapped(page)) {
/* Update mapped_file data for mem_cgroup */
preempt_disable();
__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
- mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
+ mem_cgroup_charge_statistics(from, anon, -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
__mem_cgroup_cancel_charge(from, nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
+ mem_cgroup_charge_statistics(to, anon, nr_pages);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
* guaranteed that "to" is never removed. So, we don't check rmdir
* status here.
*/
- move_unlock_page_cgroup(pc, &flags);
+ move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
unlock_page_cgroup(pc);
@@ -2914,7 +2944,6 @@ direct_uncharge:
res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
if (unlikely(batch->memcg != memcg))
memcg_oom_recover(memcg);
- return;
}
/*
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
struct page_cgroup *pc;
+ bool anon;
if (mem_cgroup_disabled())
return NULL;
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (!PageCgroupUsed(pc))
goto unlock_out;
+ anon = PageAnon(page);
+
switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+ /*
+ * Generally PageAnon tells if it's the anon statistics to be
+ * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
+ * used before page reached the stage of being marked PageAnon.
+ */
+ anon = true;
+ /* fallthrough */
case MEM_CGROUP_CHARGE_TYPE_DROP:
/* See mem_cgroup_prepare_migration() */
if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
+ mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
{
struct page *used, *unused;
struct page_cgroup *pc;
+ bool anon;
if (!memcg)
return;
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
-
- __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
+ anon = PageAnon(used);
+ __mem_cgroup_uncharge_common(unused,
+ anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
+ : MEM_CGROUP_CHARGE_TYPE_CACHE);
/*
* If a page is a file cache, radix-tree replacement is very atomic
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
* and USED bit check in mem_cgroup_uncharge_page() will do enough
* check. (see prepare_charge() also)
*/
- if (PageAnon(used))
+ if (anon)
mem_cgroup_uncharge_page(used);
/*
* At migration, we may charge account against cgroup which has no
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
/* fix accounting on old pages */
lock_page_cgroup(pc);
memcg = pc->mem_cgroup;
- mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
+ mem_cgroup_charge_statistics(memcg, false, -1);
ClearPageCgroupUsed(pc);
unlock_page_cgroup(pc);
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
nr_scanned = 0;
- reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
+ reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);
if (next_mz == mz)
- css_put(&next_mz->mem->css);
+ css_put(&next_mz->memcg->css);
else /* next_mz == NULL or other memcg */
break;
} while (1);
}
- __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
- excess = res_counter_soft_limit_excess(&mz->mem->res);
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->memcg->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* term TODO.
*/
/* If excess == 0, no tree ops */
- __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
spin_unlock(&mctz->lock);
- css_put(&mz->mem->css);
+ css_put(&mz->memcg->css);
loop++;
/*
* Could not reclaim anything and there are no more
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
} while (!nr_reclaimed);
if (next_mz)
- css_put(&next_mz->mem->css);
+ css_put(&next_mz->memcg->css);
return nr_reclaimed;
}
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
mz = mem_cgroup_zoneinfo(memcg, node, zid);
list = &mz->lruvec.lists[lru];
- loop = MEM_CGROUP_ZSTAT(mz, lru);
+ loop = mz->lru_size[lru];
/* give some margin against EBUSY etc...*/
loop += 256;
busy = NULL;
@@ -3703,10 +3745,10 @@ move_account:
mem_cgroup_start_move(memcg);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
- enum lru_list l;
- for_each_lru(l) {
+ enum lru_list lru;
+ for_each_lru(lru) {
ret = mem_cgroup_force_empty_list(memcg,
- node, zid, l);
+ node, zid, lru);
if (ret)
break;
}
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
break;
default:
BUG();
- break;
}
return val;
}
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
out:
*mem_limit = min_limit;
*memsw_limit = min_memsw_limit;
- return;
}
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
unsigned long node_nr;
struct cgroup *cont = m->private;
- struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
+ total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
seq_printf(m, "total=%lu", total_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
- file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
+ file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
seq_printf(m, "file=%lu", file_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_FILE);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
- anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
+ anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
seq_printf(m, "anon=%lu", anon_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_ANON);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
- unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
+ unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
seq_printf(m, "unevictable=%lu", unevictable_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
BIT(LRU_UNEVICTABLE));
seq_printf(m, " N%d=%lu", nid, node_nr);
}
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
struct cgroup_map_cb *cb)
{
- struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
struct mcs_total_stat mystat;
int i;
memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_local_stat(mem_cont, &mystat);
+ mem_cgroup_get_local_stat(memcg, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
/* Hierarchical information */
{
unsigned long long limit, memsw_limit;
- memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
+ memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
cb->fill(cb, "hierarchical_memory_limit", limit);
if (do_swap_account)
cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
}
memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_total_stat(mem_cont, &mystat);
+ mem_cgroup_get_total_stat(memcg, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
recent_rotated[0] +=
mz->reclaim_stat.recent_rotated[0];
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
else
BUG();
- /*
- * Something went wrong if we trying to unregister a threshold
- * if we don't have thresholds
- */
- BUG_ON(!thresholds);
-
if (!thresholds->primary)
goto unlock;
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
- enum lru_list l;
+ enum lru_list lru;
int zone, tmp = node;
/*
* This routine is called against possible nodes.
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
- for_each_lru(l)
- INIT_LIST_HEAD(&mz->lruvec.lists[l]);
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
mz->usage_in_excess = 0;
mz->on_tree = false;
- mz->mem = memcg;
+ mz->memcg = memcg;
}
memcg->info.nodeinfo[node] = pn;
return 0;
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
int size = sizeof(struct mem_cgroup);
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kzalloc(size, GFP_KERNEL);
+ memcg = kzalloc(size, GFP_KERNEL);
else
- mem = vzalloc(size);
+ memcg = vzalloc(size);
- if (!mem)
+ if (!memcg)
return NULL;
- mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!mem->stat)
+ memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat)
goto out_free;
- spin_lock_init(&mem->pcp_counter_lock);
- return mem;
+ spin_lock_init(&memcg->pcp_counter_lock);
+ return memcg;
out_free:
if (size < PAGE_SIZE)
- kfree(mem);
+ kfree(memcg);
else
- vfree(mem);
+ vfree(memcg);
return NULL;
}
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont)
atomic_set(&memcg->refcnt, 1);
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
+ spin_lock_init(&memcg->move_lock);
return &memcg->css;
free_out:
__mem_cgroup_free(memcg);
@@ -5075,7 +5110,7 @@ one_by_one:
}
/**
- * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * get_mctgt_type - get target type of moving charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
@@ -5098,7 +5133,7 @@ union mc_target {
};
enum mc_target_type {
- MC_TARGET_NONE, /* not used */
+ MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
};
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return page;
}
-static int is_target_pte_for_mc(struct vm_area_struct *vma,
+static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
struct page_cgroup *pc;
- int ret = 0;
+ enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
if (pte_present(ptent))
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
- return 0;
+ return ret;
if (page) {
pc = lookup_page_cgroup(page);
/*
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * We don't consider swapping or file mapped pages because THP does not
+ * support them for now.
+ * Caller should make sure that pmd_trans_huge(pmd) is true.
+ */
+static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ struct page *page = NULL;
+ struct page_cgroup *pc;
+ enum mc_target_type ret = MC_TARGET_NONE;
+
+ page = pmd_page(pmd);
+ VM_BUG_ON(!page || !PageHead(page));
+ if (!move_anon())
+ return ret;
+ pc = lookup_page_cgroup(page);
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
+ if (target) {
+ get_page(page);
+ target->page = page;
+ }
+ }
+ return ret;
+}
+#else
+static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ return MC_TARGET_NONE;
+}
+#endif
+
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(walk->mm, pmd);
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+ mc.precharge += HPAGE_PMD_NR;
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
- if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+ if (get_mctgt_type(vma, addr, *pte, NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
+ enum mc_target_type target_type;
+ union mc_target target;
+ struct page *page;
+ struct page_cgroup *pc;
+
+ /*
+ * We don't take compound_lock() here but no race with splitting thp
+ * happens because:
+ * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
+ * under splitting, which means there's no concurrent thp split,
+ * - if another thread runs into split_huge_page() just after we
+ * entered this if-block, the thread must wait for page table lock
+ * to be unlocked in __split_huge_page_splitting(), where the main
+ * part of thp split is not executed yet.
+ */
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (!mc.precharge) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
+ target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+ if (target_type == MC_TARGET_PAGE) {
+ page = target.page;
+ if (!isolate_lru_page(page)) {
+ pc = lookup_page_cgroup(page);
+ if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ pc, mc.from, mc.to,
+ false)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ putback_lru_page(page);
+ }
+ put_page(page);
+ }
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
- split_huge_page_pmd(walk->mm, pmd);
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
- union mc_target target;
- int type;
- struct page *page;
- struct page_cgroup *pc;
swp_entry_t ent;
if (!mc.precharge)
break;
- type = is_target_pte_for_mc(vma, addr, ptent, &target);
- switch (type) {
+ switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_PAGE:
page = target.page;
if (isolate_lru_page(page))
@@ -5417,7 +5524,7 @@ retry:
mc.moved_charge++;
}
putback_lru_page(page);
-put: /* is_target_pte_for_mc() gets the page */
+put: /* get_mctgt_type() gets the page */
put_page(page);
break;
case MC_TARGET_SWAP:
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 56080ea36140..c22076ffdd44 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1063,7 +1063,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageHuge(p) && !PageTransCompound(p)) {
+ if (!PageHuge(p) && !PageTransTail(p)) {
if (!PageLRU(p))
shake_page(p, 0);
if (!PageLRU(p)) {
diff --git a/mm/memory.c b/mm/memory.c
index 8438c157e4d9..3416b6e018d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
#if defined(SPLIT_RSS_COUNTING)
-static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+void sync_mm_rss(struct mm_struct *mm)
{
int i;
for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (task->rss_stat.count[i]) {
- add_mm_counter(mm, i, task->rss_stat.count[i]);
- task->rss_stat.count[i] = 0;
+ if (current->rss_stat.count[i]) {
+ add_mm_counter(mm, i, current->rss_stat.count[i]);
+ current->rss_stat.count[i] = 0;
}
}
- task->rss_stat.events = 0;
+ current->rss_stat.events = 0;
}
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
if (unlikely(task != current))
return;
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- __sync_task_rss_stat(task, task->mm);
-}
-
-unsigned long get_mm_counter(struct mm_struct *mm, int member)
-{
- long val = 0;
-
- /*
- * Don't use task->mm here...for avoiding to use task_get_mm()..
- * The caller must guarantee task->mm is not invalid.
- */
- val = atomic_long_read(&mm->rss_stat.count[member]);
- /*
- * counter is updated in asynchronous manner and may go to minus.
- * But it's never be expected number for users.
- */
- if (val < 0)
- return 0;
- return (unsigned long)val;
-}
-
-void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
-{
- __sync_task_rss_stat(task, mm);
+ sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
int i;
if (current->mm == mm)
- sync_mm_rss(current, mm);
+ sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
- if (next-addr != HPAGE_PMD_SIZE) {
+ if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
- continue;
+ goto next;
/* fall through */
}
- if (pmd_none_or_clear_bad(pmd))
- continue;
+ /*
+ * Here there can be other concurrent MADV_DONTNEED or
+ * trans huge page faults running, and if the pmd is
+ * none or trans huge it can change under us. This is
+ * because MADV_DONTNEED holds the mmap_sem in read
+ * mode.
+ */
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ goto next;
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
cond_resched();
} while (pmd++, addr = next, addr != end);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 47296fee23db..cfb6c8678754 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
next = pmd_addr_end(addr, end);
split_huge_page_pmd(vma->vm_mm, pmd);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
flags, private))
@@ -1323,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
err = -ESRCH;
goto out;
}
- mm = get_task_mm(task);
- rcu_read_unlock();
+ get_task_struct(task);
err = -EINVAL;
- if (!mm)
- goto out;
/*
* Check if this process has the right to modify the specified
@@ -1336,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
!capable(CAP_SYS_NICE)) {
rcu_read_unlock();
err = -EPERM;
- goto out;
+ goto out_put;
}
rcu_read_unlock();
@@ -1351,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
- goto out;
+ goto out_put;
}
if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
- goto out;
+ goto out_put;
}
err = security_task_movememory(task);
if (err)
- goto out;
+ goto out_put;
- err = do_migrate_pages(mm, old, new,
- capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
-out:
+ mm = get_task_mm(task);
+ put_task_struct(task);
if (mm)
- mmput(mm);
+ err = do_migrate_pages(mm, old, new,
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+ else
+ err = -EINVAL;
+
+ mmput(mm);
+out:
NODEMASK_SCRATCH_FREE(scratch);
return err;
+
+out_put:
+ put_task_struct(task);
+ goto out;
+
}
@@ -1844,18 +1850,24 @@ struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy *pol;
struct zonelist *zl;
struct page *page;
+ unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+ pol = get_vma_policy(current, vma, addr);
+ cpuset_mems_cookie = get_mems_allowed();
- get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
zl = policy_zonelist(gfp, pol, node);
@@ -1866,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}
/*
@@ -1874,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
*/
page = __alloc_pages_nodemask(gfp, order, zl,
policy_nodemask(gfp, pol));
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}
@@ -1901,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
struct page *page;
+ unsigned int cpuset_mems_cookie;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
@@ -1916,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- put_mems_allowed();
+
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
diff --git a/mm/migrate.c b/mm/migrate.c
index 1503b6b54ecb..51c08a0c6f68 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1174,20 +1174,17 @@ set_status:
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
*/
-static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
unsigned long nr_pages,
const void __user * __user *pages,
const int __user *nodes,
int __user *status, int flags)
{
struct page_to_node *pm;
- nodemask_t task_nodes;
unsigned long chunk_nr_pages;
unsigned long chunk_start;
int err;
- task_nodes = cpuset_mems_allowed(task);
-
err = -ENOMEM;
pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
if (!pm)
@@ -1349,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
struct task_struct *task;
struct mm_struct *mm;
int err;
+ nodemask_t task_nodes;
/* Check flags */
if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1364,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
rcu_read_unlock();
return -ESRCH;
}
- mm = get_task_mm(task);
- rcu_read_unlock();
-
- if (!mm)
- return -EINVAL;
+ get_task_struct(task);
/*
* Check if this process has the right to modify the specified
@@ -1376,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
@@ -1391,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
if (err)
goto out;
- if (nodes) {
- err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
- flags);
- } else {
- err = do_pages_stat(mm, nr_pages, pages, status);
- }
+ task_nodes = cpuset_mems_allowed(task);
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (mm) {
+ if (nodes)
+ err = do_pages_move(mm, task_nodes, nr_pages, pages,
+ nodes, status, flags);
+ else
+ err = do_pages_stat(mm, nr_pages, pages, status);
+ } else
+ err = -EINVAL;
-out:
mmput(mm);
return err;
+
+out:
+ put_task_struct(task);
+ return err;
}
/*
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff2..936b4cee8cb1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
}
/* fall through */
}
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
mincore_unmapped_range(vma, addr, next, vec);
else
mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mmap.c b/mm/mmap.c
index 6f3766b57803..a7bf6a31c9f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Helper for vma_adjust in the split_vma insert case:
- * insert vm structure into list and rbtree and anon_vma,
- * but it has already been inserted into prio_tree earlier.
+ * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
+ * mm's list and rbtree. It has already been inserted into the prio_tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
@@ -1112,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE);
if (IS_ERR(file))
return PTR_ERR(file);
}
@@ -1439,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
/*
* Is this a new hole at the lowest possible address?
*/
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+ if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
mm->free_area_cache = addr;
- mm->cached_hole_size = ~0UL;
- }
}
/*
@@ -1457,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
+ unsigned long addr = addr0, start_addr;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -1481,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
mm->free_area_cache = mm->mmap_base;
}
+try_again:
/* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
- }
-
- if (mm->mmap_base < len)
- goto bottomup;
+ start_addr = addr = mm->free_area_cache;
- addr = mm->mmap_base-len;
+ if (addr < len)
+ goto fail;
+ addr -= len;
do {
/*
* Lookup failure means no vma is above this address,
@@ -1516,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = vma->vm_start-len;
} while (len < vma->vm_start);
-bottomup:
+fail:
+ /*
+ * if hint left us with no space for the requested
+ * mapping then try again:
+ *
+ * Note: this is different with the case of bottomup
+ * which does the fully line-search, but we use find_vma
+ * here that causes some holes skipped.
+ */
+ if (start_addr != mm->mmap_base) {
+ mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = 0;
+ goto try_again;
+ }
+
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index cf332bc0080a..3dcfaf4ed355 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
- sync_mm_rss(tsk, mm);
+ sync_mm_rss(mm);
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 142ef4a1f480..a40992610ab6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
ptent = pte_mkwrite(ptent);
ptep_modify_prot_commit(mm, addr, pte, ptent);
- } else if (PAGE_MIGRATION && !pte_file(oldpte)) {
+ } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2958fd8e7c9a..4198e000f41a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,7 @@
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
+#include <linux/ratelimit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
*/
static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long totalpages, struct mem_cgroup *memcg,
- const nodemask_t *nodemask)
+ const nodemask_t *nodemask, bool force_kill)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
if (unlikely(frozen(p)))
__thaw_task(p);
- return ERR_PTR(-1UL);
+ if (!force_kill)
+ return ERR_PTR(-1UL);
}
if (!p->mm)
continue;
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
if (p == current) {
chosen = p;
*ppoints = 1000;
- } else {
+ } else if (!force_kill) {
/*
* If this task is not being ptraced on exit,
* then wait for it to finish before killing
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
#define K(x) ((x) << (PAGE_SHIFT-10))
-static int oom_kill_task(struct task_struct *p)
-{
- struct task_struct *q;
- struct mm_struct *mm;
-
- p = find_lock_task_mm(p);
- if (!p)
- return 1;
-
- /* mm cannot be safely dereferenced after task_unlock(p) */
- mm = p->mm;
-
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
- task_pid_nr(p), p->comm, K(p->mm->total_vm),
- K(get_mm_counter(p->mm, MM_ANONPAGES)),
- K(get_mm_counter(p->mm, MM_FILEPAGES)));
- task_unlock(p);
-
- /*
- * Kill all user processes sharing p->mm in other thread groups, if any.
- * They don't get access to memory reserves or a higher scheduler
- * priority, though, to avoid depletion of all memory or task
- * starvation. This prevents mm->mmap_sem livelock when an oom killed
- * task cannot exit because it requires the semaphore and its contended
- * by another thread trying to allocate memory itself. That thread will
- * now get access to memory reserves since it has a pending fatal
- * signal.
- */
- for_each_process(q)
- if (q->mm == mm && !same_thread_group(q, p) &&
- !(q->flags & PF_KTHREAD)) {
- if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
- continue;
-
- task_lock(q); /* Protect ->comm from prctl() */
- pr_err("Kill process %d (%s) sharing same memory\n",
- task_pid_nr(q), q->comm);
- task_unlock(q);
- force_sig(SIGKILL, q);
- }
-
- set_tsk_thread_flag(p, TIF_MEMDIE);
- force_sig(SIGKILL, p);
-
- return 0;
-}
-#undef K
-
-static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ unsigned int points, unsigned long totalpages,
+ struct mem_cgroup *memcg, nodemask_t *nodemask,
+ const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
struct task_struct *t = p;
+ struct mm_struct *mm;
unsigned int victim_points = 0;
-
- if (printk_ratelimit())
- dump_header(p, gfp_mask, order, memcg, nodemask);
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
*/
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
- return 0;
+ return;
}
+ if (__ratelimit(&oom_rs))
+ dump_header(p, gfp_mask, order, memcg, nodemask);
+
task_lock(p);
pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
}
} while_each_thread(p, t);
- return oom_kill_task(victim);
+ victim = find_lock_task_mm(victim);
+ if (!victim)
+ return;
+
+ /* mm cannot safely be dereferenced after task_unlock(victim) */
+ mm = victim->mm;
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+ K(get_mm_counter(victim->mm, MM_ANONPAGES)),
+ K(get_mm_counter(victim->mm, MM_FILEPAGES)));
+ task_unlock(victim);
+
+ /*
+ * Kill all user processes sharing victim->mm in other thread groups, if
+ * any. They don't get access to memory reserves, though, to avoid
+ * depletion of all memory. This prevents mm->mmap_sem livelock when an
+ * oom killed thread cannot exit because it requires the semaphore and
+ * its contended by another thread trying to allocate memory itself.
+ * That thread will now get access to memory reserves since it has a
+ * pending fatal signal.
+ */
+ for_each_process(p)
+ if (p->mm == mm && !same_thread_group(p, victim) &&
+ !(p->flags & PF_KTHREAD)) {
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ continue;
+
+ task_lock(p); /* Protect ->comm from prctl() */
+ pr_err("Kill process %d (%s) sharing same memory\n",
+ task_pid_nr(p), p->comm);
+ task_unlock(p);
+ force_sig(SIGKILL, p);
+ }
+
+ set_tsk_thread_flag(victim, TIF_MEMDIE);
+ force_sig(SIGKILL, victim);
}
+#undef K
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
+void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ int order)
{
unsigned long limit;
unsigned int points = 0;
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
return;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
-retry:
- p = select_bad_process(&points, limit, memcg, NULL);
- if (!p || PTR_ERR(p) == -1UL)
- goto out;
-
- if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
- "Memory cgroup out of memory"))
- goto retry;
-out:
+ p = select_bad_process(&points, limit, memcg, NULL, false);
+ if (p && PTR_ERR(p) != -1UL)
+ oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
+ "Memory cgroup out of memory");
read_unlock(&tasklist_lock);
}
#endif
@@ -700,6 +689,7 @@ static void clear_system_oom(void)
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
* @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
@@ -707,7 +697,7 @@ static void clear_system_oom(void)
* don't have to be perfect here, we just have to be good.
*/
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask)
+ int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
struct task_struct *p;
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
current->mm) {
- /*
- * oom_kill_process() needs tasklist_lock held. If it returns
- * non-zero, current could not be killed so we must fallback to
- * the tasklist scan.
- */
- if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
- NULL, nodemask,
- "Out of memory (oom_kill_allocating_task)"))
- goto out;
- }
-
-retry:
- p = select_bad_process(&points, totalpages, NULL, mpol_mask);
- if (PTR_ERR(p) == -1UL)
+ oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
+ nodemask,
+ "Out of memory (oom_kill_allocating_task)");
goto out;
+ }
+ p = select_bad_process(&points, totalpages, NULL, mpol_mask,
+ force_kill);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
-
- if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory"))
- goto retry;
- killed = 1;
+ if (PTR_ERR(p) != -1UL) {
+ oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+ nodemask, "Out of memory");
+ killed = 1;
+ }
out:
read_unlock(&tasklist_lock);
@@ -792,7 +774,7 @@ out:
void pagefault_out_of_memory(void)
{
if (try_set_system_oom()) {
- out_of_memory(NULL, 0, 0, NULL);
+ out_of_memory(NULL, 0, 0, NULL, false);
clear_system_oom();
}
if (!test_thread_flag(TIF_MEMDIE))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 363ba7082ef5..3fc261705b1e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1472,6 +1472,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
for ( ; ; ) {
global_dirty_limits(&background_thresh, &dirty_thresh);
+ dirty_thresh = hard_dirty_limit(dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a13ded1938f0..caea788628e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- out_of_memory(zonelist, gfp_mask, order, nodemask);
+ out_of_memory(zonelist, gfp_mask, order, nodemask, false);
out:
clear_zonelist_oom(zonelist, gfp_mask);
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
if (!order)
return NULL;
- if (compaction_deferred(preferred_zone)) {
+ if (compaction_deferred(preferred_zone, order)) {
*deferred_compaction = true;
return NULL;
}
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
if (page) {
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
+ if (order >= preferred_zone->compact_order_failed)
+ preferred_zone->compact_order_failed = order + 1;
count_vm_event(COMPACTSUCCESS);
return page;
}
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* defer if the failure was a sync compaction failure.
*/
if (sync_migration)
- defer_compaction(preferred_zone);
+ defer_compaction(preferred_zone, order);
cond_resched();
}
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
- struct page *page;
+ struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ unsigned int cpuset_mems_cookie;
gfp_mask &= gfp_allowed_mask;
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
- if (!preferred_zone) {
- put_mems_allowed();
- return NULL;
- }
+ if (!preferred_zone)
+ goto out;
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
- put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+ /*
+ * When updating a task's mems_allowed, it is possible to race with
+ * parallel threads in such a way that an allocation can fail while
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
+ unsigned int cpuset_mems_cookie;
if (!(flags & SHOW_MEM_FILTER_NODES))
goto out;
- get_mems_allowed();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- put_mems_allowed();
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ } while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
}
}
-int __init add_from_early_node_map(struct range *range, int az,
- int nr_range, int nid)
-{
- unsigned long start_pfn, end_pfn;
- int i;
-
- /* need to go over early_node_map to find out good range for node */
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
- nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
- return nr_range;
-}
-
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void)
* memory. When they don't, some nodes will have more kernelcore than
* others
*/
-static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(void)
{
int i, nid;
unsigned long usable_startpfn;
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
- find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+ find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
int cpu = (unsigned long)hcpu;
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ lru_add_drain_cpu(cpu);
drain_pages(cpu);
/*
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff660..aa9701e12714 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
continue;
split_huge_page_pmd(walk->mm, pmd);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index eb663fb533e0..5a74fea182f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
int young;
-#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#else
BUG();
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
young = pmdp_test_and_clear_young(vma, address, pmdp);
if (young)
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e06b6c8..5b5ad584ffb7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
+{
+ avc->vma = vma;
+ avc->anon_vma = anon_vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+
+ /*
+ * It's critical to add new vmas to the tail of the anon_vma,
+ * see comment in huge_memory.c:__split_huge_page().
+ */
+ list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+}
+
/**
* anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- avc->anon_vma = anon_vma;
- avc->vma = vma;
- list_add(&avc->same_vma, &vma->anon_vma_chain);
- list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ anon_vma_chain_link(vma, avc, anon_vma);
allocated = NULL;
avc = NULL;
}
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
mutex_unlock(&root->mutex);
}
-static void anon_vma_chain_link(struct vm_area_struct *vma,
- struct anon_vma_chain *avc,
- struct anon_vma *anon_vma)
-{
- avc->vma = vma;
- avc->anon_vma = anon_vma;
- list_add(&avc->same_vma, &vma->anon_vma_chain);
-
- /*
- * It's critical to add new vmas to the tail of the anon_vma,
- * see comment in huge_memory.c:__split_huge_page().
- */
- list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-}
-
/*
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
+ bool locked;
+ unsigned long flags;
+
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/**
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
+ bool anon = PageAnon(page);
+ bool locked;
+ unsigned long flags;
+
+ /*
+ * The anon case has no mem_cgroup page_stat to update; but may
+ * uncharge_page() below, where the lock ordering can deadlock if
+ * we hold the lock against page_stat move: so avoid it on anon.
+ */
+ if (!anon)
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ goto out;
/*
* Now that the last pte has gone, s390 must transfer dirty
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page)
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
*/
- if ((!PageAnon(page) || PageSwapCache(page)) &&
+ if ((!anon || PageSwapCache(page)) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page)
* and not charged by memcg for now.
*/
if (unlikely(PageHuge(page)))
- return;
- if (PageAnon(page)) {
+ goto out;
+ if (anon) {
mem_cgroup_uncharge_page(page);
if (!PageTransHuge(page))
__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+out:
+ if (!anon)
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/*
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
dec_mm_counter(mm, MM_ANONPAGES);
inc_mm_counter(mm, MM_SWAPENTS);
- } else if (PAGE_MIGRATION) {
+ } else if (IS_ENABLED(CONFIG_MIGRATION)) {
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
- } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
+ } else if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (TTU_ACTION(flags) == TTU_MIGRATION)) {
/* Establish migration entry for a file page */
swp_entry_t entry;
entry = make_migration_entry(page, pte_write(pteval));
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
+ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
is_vma_temporary_stack(vma))
continue;
diff --git a/mm/shmem.c b/mm/shmem.c
index 7a45ad004cfd..f99ff3e50bd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+#else
+#define shmem_initxattrs NULL
+#endif
+
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
if (inode) {
error = security_inode_init_security(inode, dir,
&dentry->d_name,
- NULL, NULL);
+ shmem_initxattrs, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
return -ENOSPC;
error = security_inode_init_security(inode, dir, &dentry->d_name,
- NULL, NULL);
+ shmem_initxattrs, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
* filesystem level, though.
*/
+/*
+ * Allocate new xattr and copy in the value; but leave the name to callers.
+ */
+static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
+{
+ struct shmem_xattr *new_xattr;
+ size_t len;
+
+ /* wrap around? */
+ len = sizeof(*new_xattr) + size;
+ if (len <= sizeof(*new_xattr))
+ return NULL;
+
+ new_xattr = kmalloc(len, GFP_KERNEL);
+ if (!new_xattr)
+ return NULL;
+
+ new_xattr->size = size;
+ memcpy(new_xattr->value, value, size);
+ return new_xattr;
+}
+
+/*
+ * Callback for security_inode_init_security() for acquiring xattrs.
+ */
+static int shmem_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ const struct xattr *xattr;
+ struct shmem_xattr *new_xattr;
+ size_t len;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
+ if (!new_xattr)
+ return -ENOMEM;
+
+ len = strlen(xattr->name) + 1;
+ new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
+ GFP_KERNEL);
+ if (!new_xattr->name) {
+ kfree(new_xattr);
+ return -ENOMEM;
+ }
+
+ memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
+ memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
+ xattr->name, len);
+
+ spin_lock(&info->lock);
+ list_add(&new_xattr->list, &info->xattr_list);
+ spin_unlock(&info->lock);
+ }
+
+ return 0;
+}
+
static int shmem_xattr_get(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name,
return ret;
}
-static int shmem_xattr_set(struct dentry *dentry, const char *name,
+static int shmem_xattr_set(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
- struct inode *inode = dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_xattr *xattr;
struct shmem_xattr *new_xattr = NULL;
- size_t len;
int err = 0;
/* value == NULL means remove */
if (value) {
- /* wrap around? */
- len = sizeof(*new_xattr) + size;
- if (len <= sizeof(*new_xattr))
- return -ENOMEM;
-
- new_xattr = kmalloc(len, GFP_KERNEL);
+ new_xattr = shmem_xattr_alloc(value, size);
if (!new_xattr)
return -ENOMEM;
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name,
kfree(new_xattr);
return -ENOMEM;
}
-
- new_xattr->size = size;
- memcpy(new_xattr->value, value, size);
}
spin_lock(&info->lock);
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
if (size == 0)
value = ""; /* empty EA, do not remove */
- return shmem_xattr_set(dentry, name, value, size, flags);
+ return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
}
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
if (err)
return err;
- return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
+ return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
}
static bool xattr_is_trusted(const char *name)
diff --git a/mm/slab.c b/mm/slab.c
index f0bd7857ab3b..29c8716eb7a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_mem_id();
- get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
- put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
+ unsigned int cpuset_mems_cookie;
if (flags & __GFP_THISNODE)
return NULL;
- get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
retry:
/*
* Look through allowed nodes for objects available
@@ -3372,7 +3373,9 @@ retry:
}
}
}
- put_mems_allowed();
+
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+ goto retry_cpuset;
return obj;
}
diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7ff..f4a6229848fd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
void *object;
+ unsigned int cpuset_mems_cookie;
/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
- get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- struct kmem_cache_node *n;
-
- n = get_node(s, zone_to_nid(zone));
-
- if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
- n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, c);
- if (object) {
- put_mems_allowed();
- return object;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ struct kmem_cache_node *n;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+ n->nr_partial > s->min_partial) {
+ object = get_partial_node(s, n, c);
+ if (object) {
+ /*
+ * Return the object even if
+ * put_mems_allowed indicated that
+ * the cpuset mems_allowed was
+ * updated in parallel. It's a
+ * harmless race between the alloc
+ * and the cpuset update.
+ */
+ put_mems_allowed(cpuset_mems_cookie);
+ return object;
+ }
}
}
- }
- put_mems_allowed();
+ } while (!put_mems_allowed(cpuset_mems_cookie));
#endif
return NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 61d7cde23111..a8bc7d364deb 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
usemap_count);
- if (usemap) {
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- usemap_map[pnum] = usemap;
- usemap += size;
+ if (!usemap) {
+ usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
+ if (!usemap) {
+ printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ return;
}
- return;
}
- usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
- if (usemap) {
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- usemap_map[pnum] = usemap;
- usemap += size;
- check_usemap_section_nr(nodeid, usemap_map[pnum]);
- }
- return;
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ usemap += size;
+ check_usemap_section_nr(nodeid, usemap_map[pnum]);
}
-
- printk(KERN_WARNING "%s: allocation failed\n", __func__);
}
#ifndef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/mm/swap.c b/mm/swap.c
index 14380e9fbe33..5c13f1338972 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
*/
-static void drain_cpu_pagevecs(int cpu)
+void lru_add_drain_cpu(int cpu)
{
struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
struct pagevec *pvec;
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
void lru_add_drain(void)
{
- drain_cpu_pagevecs(get_cpu());
+ lru_add_drain_cpu(get_cpu());
put_cpu();
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ea6b32d61873..9d3dd3763cf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
- int nr_pages;
struct page *page;
- unsigned long offset;
- unsigned long end_offset;
+ unsigned long offset = swp_offset(entry);
+ unsigned long start_offset, end_offset;
+ unsigned long mask = (1UL << page_cluster) - 1;
- /*
- * Get starting offset for readaround, and number of pages to read.
- * Adjust starting address by readbehind (for NUMA interleave case)?
- * No, it's very unlikely that swap layout would follow vma layout,
- * more likely that neighbouring swap pages came from the same node:
- * so use the same "addr" to choose the same node for each swap read.
- */
- nr_pages = valid_swaphandles(entry, &offset);
- for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
+ /* Read a page_cluster sized and aligned cluster around offset. */
+ start_offset = offset & ~mask;
+ end_offset = offset | mask;
+ if (!start_offset) /* First page is swap header. */
+ start_offset++;
+
+ for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
gfp_mask, vma, addr);
if (!page)
- break;
+ continue;
page_cache_release(page);
}
lru_add_drain(); /* Push any new pages onto the LRU now */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6bf67ab6e469..dae42f380d6e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (unlikely(pmd_trans_huge(*pmd)))
- continue;
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
if (ret)
@@ -2107,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->flags |= SWP_SOLIDSTATE;
p->cluster_next = 1 + (random32() % p->highest_bit);
}
- if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
+ if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
p->flags |= SWP_DISCARDABLE;
}
@@ -2292,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry)
}
/*
- * swap_lock prevents swap_map being freed. Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
- */
-int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
-{
- struct swap_info_struct *si;
- int our_page_cluster = page_cluster;
- pgoff_t target, toff;
- pgoff_t base, end;
- int nr_pages = 0;
-
- if (!our_page_cluster) /* no readahead */
- return 0;
-
- si = swap_info[swp_type(entry)];
- target = swp_offset(entry);
- base = (target >> our_page_cluster) << our_page_cluster;
- end = base + (1 << our_page_cluster);
- if (!base) /* first page is swap header */
- base++;
-
- spin_lock(&swap_lock);
- if (end > si->max) /* don't go beyond end of map */
- end = si->max;
-
- /* Count contiguous allocated slots above our target */
- for (toff = target; ++toff < end; nr_pages++) {
- /* Don't read in free or bad pages */
- if (!si->swap_map[toff])
- break;
- if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
- break;
- }
- /* Count contiguous allocated slots below our target */
- for (toff = target; --toff >= base; nr_pages++) {
- /* Don't read in free or bad pages */
- if (!si->swap_map[toff])
- break;
- if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
- break;
- }
- spin_unlock(&swap_lock);
-
- /*
- * Indicate starting offset, and return number of pages to get:
- * if only 1, say 0, since there's then no readahead to be done.
- */
- *offset = ++toff;
- return nr_pages? ++nr_pages: 0;
-}
-
-/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
* page of the original vmalloc'ed swap_map, to hold the continuation count
diff --git a/mm/util.c b/mm/util.c
index 136ac4f322b8..ae962b31de88 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
next->vm_prev = vma;
}
+/* Check if the vma is being used as a stack by this task */
+static int vm_is_stack_for_task(struct task_struct *t,
+ struct vm_area_struct *vma)
+{
+ return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
+}
+
+/*
+ * Check if the vma is being used as a stack.
+ * If is_group is non-zero, check in the entire thread group or else
+ * just check in the current task. Returns the pid of the task that
+ * the vma is stack for.
+ */
+pid_t vm_is_stack(struct task_struct *task,
+ struct vm_area_struct *vma, int in_group)
+{
+ pid_t ret = 0;
+
+ if (vm_is_stack_for_task(task, vma))
+ return task->pid;
+
+ if (in_group) {
+ struct task_struct *t;
+ rcu_read_lock();
+ if (!pid_alive(task))
+ goto done;
+
+ t = task;
+ do {
+ if (vm_is_stack_for_task(t, vma)) {
+ ret = t->pid;
+ goto done;
+ }
+ } while_each_thread(task, t);
+done:
+ rcu_read_unlock();
+ }
+
+ return ret;
+}
+
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c52b23552659..49f15ef0a99a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
* @mz: The mem_cgroup_zone to pull pages from.
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
- * @order: The caller's attempted allocation order
+ * @sc: The scan_control struct for this reclaim session
* @mode: One of the LRU isolation modes
* @active: True [1] if isolating active pages
* @file: True [1] if isolating file [!anon] pages
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct mem_cgroup_zone *mz, struct list_head *dst,
- unsigned long *nr_scanned, int order, isolate_mode_t mode,
- int active, int file)
+ unsigned long *nr_scanned, struct scan_control *sc,
+ isolate_mode_t mode, int active, int file)
{
struct lruvec *lruvec;
struct list_head *src;
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
BUG();
}
- if (!order)
+ if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
continue;
/*
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*/
zone_id = page_zone_id(page);
page_pfn = page_to_pfn(page);
- pfn = page_pfn & ~((1 << order) - 1);
- end_pfn = pfn + (1 << order);
+ pfn = page_pfn & ~((1 << sc->order) - 1);
+ end_pfn = pfn + (1 << sc->order);
for (; pfn < end_pfn; pfn++) {
struct page *cursor_page;
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*nr_scanned = scan;
- trace_mm_vmscan_lru_isolate(order,
+ trace_mm_vmscan_lru_isolate(sc->order,
nr_to_scan, scan,
nr_taken,
nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
unsigned long *nr_anon,
unsigned long *nr_file)
{
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
struct zone *zone = mz->zone;
unsigned int count[NR_LRU_LISTS] = { 0, };
unsigned long nr_active = 0;
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
count[lru] += numpages;
}
+ preempt_disable();
__count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- reclaim_stat->recent_scanned[0] += *nr_anon;
- reclaim_stat->recent_scanned[1] += *nr_file;
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
+ preempt_enable();
}
/*
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
unsigned long nr_file;
unsigned long nr_dirty = 0;
unsigned long nr_writeback = 0;
- isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
+ isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
struct zone *zone = mz->zone;
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
set_reclaim_mode(priority, sc, false);
if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
- reclaim_mode |= ISOLATE_ACTIVE;
+ isolate_mode |= ISOLATE_ACTIVE;
lru_add_drain();
if (!sc->may_unmap)
- reclaim_mode |= ISOLATE_UNMAPPED;
+ isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
- reclaim_mode |= ISOLATE_CLEAN;
+ isolate_mode |= ISOLATE_CLEAN;
spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
- &nr_scanned, sc->order,
- reclaim_mode, 0, file);
+ nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
+ sc, isolate_mode, 0, file);
if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scanned);
}
+ spin_unlock_irq(&zone->lru_lock);
- if (nr_taken == 0) {
- spin_unlock_irq(&zone->lru_lock);
+ if (nr_taken == 0)
return 0;
- }
update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
-
- spin_unlock_irq(&zone->lru_lock);
-
nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
&nr_dirty, &nr_writeback);
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
spin_lock_irq(&zone->lru_lock);
+ reclaim_stat->recent_scanned[0] += nr_anon;
+ reclaim_stat->recent_scanned[1] += nr_file;
+
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone,
unsigned long pgmoved = 0;
struct page *page;
- if (buffer_heads_over_limit) {
- spin_unlock_irq(&zone->lru_lock);
- list_for_each_entry(page, list, lru) {
- if (page_has_private(page) && trylock_page(page)) {
- if (page_has_private(page))
- try_to_release_page(page, 0);
- unlock_page(page);
- }
- }
- spin_lock_irq(&zone->lru_lock);
- }
-
while (!list_empty(list)) {
struct lruvec *lruvec;
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan,
struct page *page;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
unsigned long nr_rotated = 0;
- isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
+ isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
struct zone *zone = mz->zone;
lru_add_drain();
+ reset_reclaim_mode(sc);
+
if (!sc->may_unmap)
- reclaim_mode |= ISOLATE_UNMAPPED;
+ isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
- reclaim_mode |= ISOLATE_CLEAN;
+ isolate_mode |= ISOLATE_CLEAN;
spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
- &nr_scanned, sc->order,
- reclaim_mode, 1, file);
+ nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
+ isolate_mode, 1, file);
if (global_reclaim(sc))
zone->pages_scanned += nr_scanned;
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
continue;
}
+ if (unlikely(buffer_heads_over_limit)) {
+ if (page_has_private(page) && trylock_page(page)) {
+ if (page_has_private(page))
+ try_to_release_page(page, 0);
+ unlock_page(page);
+ }
+ }
+
if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
@@ -2112,7 +2107,12 @@ restart:
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
- if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
+ if (nr_reclaimed >= nr_to_reclaim)
+ nr_to_reclaim = 0;
+ else
+ nr_to_reclaim -= nr_reclaimed;
+
+ if (!nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
blk_finish_plug(&plug);
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
- if (compaction_deferred(zone))
+ if (compaction_deferred(zone, sc->order))
return watermark_ok;
/* If compaction is not ready to start, keep reclaiming */
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
unsigned long nr_soft_scanned;
bool aborted_reclaim = false;
+ /*
+ * If the number of buffer_heads in the machine exceeds the maximum
+ * allowed level, force direct reclaim to scan the highmem zone as
+ * highmem pages could be pinning lowmem pages storing buffer_heads
+ */
+ if (buffer_heads_over_limit)
+ sc->gfp_mask |= __GFP_HIGHMEM;
+
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
* Even though compaction is invoked for any
* non-zero order, only frequent costly order
* reclamation is disruptive enough to become a
- * noticable problem, like transparent huge page
- * allocations.
+ * noticeable problem, like transparent huge
+ * page allocations.
*/
if (compaction_ready(zone, sc)) {
aborted_reclaim = true;
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
unsigned long writeback_threshold;
bool aborted_reclaim;
- get_mems_allowed();
delayacct_freepages_start();
if (global_reclaim(sc))
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
out:
delayacct_freepages_end();
- put_mems_allowed();
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
@@ -2724,6 +2730,17 @@ loop_again:
*/
age_active_anon(zone, &sc, priority);
+ /*
+ * If the number of buffer_heads in the machine
+ * exceeds the maximum allowed level and this node
+ * has a highmem zone, force kswapd to reclaim from
+ * it to relieve lowmem pressure.
+ */
+ if (buffer_heads_over_limit && is_highmem_idx(i)) {
+ end_zone = i;
+ break;
+ }
+
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
@@ -2753,7 +2770,7 @@ loop_again:
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- int nr_slab;
+ int nr_slab, testorder;
unsigned long balance_gap;
if (!populated_zone(zone))
@@ -2786,7 +2803,21 @@ loop_again:
(zone->present_pages +
KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
- if (!zone_watermark_ok_safe(zone, order,
+ /*
+ * Kswapd reclaims only single pages with compaction
+ * enabled. Trying too hard to reclaim until contiguous
+ * free pages have become available can hurt performance
+ * by evicting too much useful data from memory.
+ * Do not reclaim more than needed for compaction.
+ */
+ testorder = order;
+ if (COMPACTION_BUILD && order &&
+ compaction_suitable(zone, order) !=
+ COMPACT_SKIPPED)
+ testorder = 0;
+
+ if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+ !zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone) + balance_gap,
end_zone, 0)) {
shrink_zone(priority, zone, &sc);
@@ -2815,7 +2846,7 @@ loop_again:
continue;
}
- if (!zone_watermark_ok_safe(zone, order,
+ if (!zone_watermark_ok_safe(zone, testorder,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2903,6 +2934,8 @@ out:
* and it is potentially going to sleep here.
*/
if (order) {
+ int zones_need_compaction = 1;
+
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
@@ -2912,6 +2945,10 @@ out:
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
+ /* Would compaction fail due to lack of free memory? */
+ if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
+ goto loop_again;
+
/* Confirm the zone is balanced for order-0 */
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone), 0, 0)) {
@@ -2919,11 +2956,17 @@ out:
goto loop_again;
}
+ /* Check if the memory needs to be defragmented. */
+ if (zone_watermark_ok(zone, order,
+ low_wmark_pages(zone), *classzone_idx, 0))
+ zones_need_compaction = 0;
+
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
- if (i <= *classzone_idx)
- balanced += zone->present_pages;
}
+
+ if (zones_need_compaction)
+ compact_pgdat(pgdat, order);
}
/*